xref: /illumos-gate/usr/src/uts/common/inet/ip/ip6_ire.c (revision a0e56b0eb1fdc159ff8348ca0e77d884bb7d126b)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 /*
26  * Copyright (c) 1990 Mentat Inc.
27  */
28 
29 #pragma ident	"%Z%%M%	%I%	%E% SMI"
30 
31 /*
32  * This file contains routines that manipulate Internet Routing Entries (IREs).
33  */
34 #include <sys/types.h>
35 #include <sys/stream.h>
36 #include <sys/stropts.h>
37 #include <sys/ddi.h>
38 #include <sys/cmn_err.h>
39 
40 #include <sys/systm.h>
41 #include <sys/param.h>
42 #include <sys/socket.h>
43 #include <net/if.h>
44 #include <net/route.h>
45 #include <netinet/in.h>
46 #include <net/if_dl.h>
47 #include <netinet/ip6.h>
48 #include <netinet/icmp6.h>
49 
50 #include <inet/common.h>
51 #include <inet/mi.h>
52 #include <inet/ip.h>
53 #include <inet/ip6.h>
54 #include <inet/ip_ndp.h>
55 #include <inet/ip_if.h>
56 #include <inet/ip_ire.h>
57 #include <inet/ipclassifier.h>
58 #include <inet/nd.h>
59 #include <sys/kmem.h>
60 #include <sys/zone.h>
61 
62 #include <sys/tsol/label.h>
63 #include <sys/tsol/tnet.h>
64 
65 irb_t *ip_forwarding_table_v6[IP6_MASK_TABLE_SIZE];
66 /* This is dynamically allocated in ip_ire_init */
67 irb_t *ip_cache_table_v6;
68 static	ire_t	ire_null;
69 
70 static ire_t	*ire_ihandle_lookup_onlink_v6(ire_t *cire);
71 static	void	ire_report_ftable_v6(ire_t *ire, char *mp);
72 static	void	ire_report_ctable_v6(ire_t *ire, char *mp);
73 static boolean_t ire_match_args_v6(ire_t *ire, const in6_addr_t *addr,
74     const in6_addr_t *mask, const in6_addr_t *gateway, int type,
75     const ipif_t *ipif, zoneid_t zoneid, uint32_t ihandle,
76     const ts_label_t *tsl, int match_flags);
77 
78 /*
79  * Named Dispatch routine to produce a formatted report on all IREs.
80  * This report is accessed by using the ndd utility to "get" ND variable
81  * "ip_ire_status_v6".
82  */
83 /* ARGSUSED */
84 int
85 ip_ire_report_v6(queue_t *q, mblk_t *mp, caddr_t arg, cred_t *ioc_cr)
86 {
87 	zoneid_t zoneid;
88 
89 	(void) mi_mpprintf(mp,
90 	    "IRE      " MI_COL_HDRPAD_STR
91 	    "rfq      " MI_COL_HDRPAD_STR
92 	    "stq      " MI_COL_HDRPAD_STR
93 	    " zone mxfrg rtt   rtt_sd ssthresh ref "
94 	    "rtomax tstamp_ok wscale_ok ecn_ok pmtud_ok sack sendpipe recvpipe "
95 	    "in/out/forward type    addr         mask         "
96 	    "src             gateway");
97 	/*
98 	 *   01234567 01234567 01234567 12345 12345 12345 12345  12345678 123
99 	 *   123456 123456789 123456789 123456 12345678 1234 12345678 12345678
100 	 *   in/out/forward xxxxxxxxxx
101 	 *   xxxx:xxxx:xxxx:xxxx:xxxx:xxxx:xxxx:xxxx
102 	 *   xxxx:xxxx:xxxx:xxxx:xxxx:xxxx:xxxx:xxxx
103 	 *   xxxx:xxxx:xxxx:xxxx:xxxx:xxxx:xxxx:xxxx
104 	 *   xxxx:xxxx:xxxx:xxxx:xxxx:xxxx:xxxx:xxxx
105 	 */
106 
107 	/*
108 	 * Because of the ndd constraint, at most we can have 64K buffer
109 	 * to put in all IRE info.  So to be more efficient, just
110 	 * allocate a 64K buffer here, assuming we need that large buffer.
111 	 * This should be OK as only root can do ndd /dev/ip.
112 	 */
113 	if ((mp->b_cont = allocb(ND_MAX_BUF_LEN, BPRI_HI)) == NULL) {
114 		/* The following may work even if we cannot get a large buf. */
115 		(void) mi_mpprintf(mp, "<< Out of buffer >>\n");
116 		return (0);
117 	}
118 	zoneid = Q_TO_CONN(q)->conn_zoneid;
119 	if (zoneid == GLOBAL_ZONEID)
120 		zoneid = ALL_ZONES;
121 
122 	ire_walk_v6(ire_report_ftable_v6, (char *)mp->b_cont, zoneid);
123 	ire_walk_v6(ire_report_ctable_v6, (char *)mp->b_cont, zoneid);
124 	return (0);
125 }
126 
127 /*
128  * ire_walk routine invoked for ip_ire_report_v6 for each IRE.
129  */
130 static void
131 ire_report_ftable_v6(ire_t *ire, char *mp)
132 {
133 	char	buf1[INET6_ADDRSTRLEN];
134 	char	buf2[INET6_ADDRSTRLEN];
135 	char	buf3[INET6_ADDRSTRLEN];
136 	char	buf4[INET6_ADDRSTRLEN];
137 	uint_t	fo_pkt_count;
138 	uint_t	ib_pkt_count;
139 	int	ref;
140 	in6_addr_t gw_addr_v6;
141 	uint_t	print_len, buf_len;
142 
143 	ASSERT(ire->ire_ipversion == IPV6_VERSION);
144 	if (ire->ire_type & IRE_CACHETABLE)
145 	    return;
146 	buf_len = ((mblk_t *)mp)->b_datap->db_lim - ((mblk_t *)mp)->b_wptr;
147 	if (buf_len <= 0)
148 		return;
149 
150 	/* Number of active references of this ire */
151 	ref = ire->ire_refcnt;
152 	/* "inbound" to a non local address is a forward */
153 	ib_pkt_count = ire->ire_ib_pkt_count;
154 	fo_pkt_count = 0;
155 	ASSERT(!(ire->ire_type & IRE_BROADCAST));
156 	if (!(ire->ire_type & (IRE_LOCAL|IRE_BROADCAST))) {
157 		fo_pkt_count = ib_pkt_count;
158 		ib_pkt_count = 0;
159 	}
160 
161 	mutex_enter(&ire->ire_lock);
162 	gw_addr_v6 = ire->ire_gateway_addr_v6;
163 	mutex_exit(&ire->ire_lock);
164 
165 	print_len = snprintf((char *)((mblk_t *)mp)->b_wptr, buf_len,
166 	    MI_COL_PTRFMT_STR MI_COL_PTRFMT_STR MI_COL_PTRFMT_STR "%5d "
167 	    "%05d %05ld %06ld %08d %03d %06d %09d %09d %06d %08d "
168 	    "%04d %08d %08d %d/%d/%d %s\n\t%s\n\t%s\n\t%s\n\t%s\n",
169 	    (void *)ire, (void *)ire->ire_rfq, (void *)ire->ire_stq,
170 	    (int)ire->ire_zoneid,
171 	    ire->ire_max_frag, ire->ire_uinfo.iulp_rtt,
172 	    ire->ire_uinfo.iulp_rtt_sd,
173 	    ire->ire_uinfo.iulp_ssthresh, ref,
174 	    ire->ire_uinfo.iulp_rtomax,
175 	    (ire->ire_uinfo.iulp_tstamp_ok ? 1: 0),
176 	    (ire->ire_uinfo.iulp_wscale_ok ? 1: 0),
177 	    (ire->ire_uinfo.iulp_ecn_ok ? 1: 0),
178 	    (ire->ire_uinfo.iulp_pmtud_ok ? 1: 0),
179 	    ire->ire_uinfo.iulp_sack,
180 	    ire->ire_uinfo.iulp_spipe, ire->ire_uinfo.iulp_rpipe,
181 	    ib_pkt_count, ire->ire_ob_pkt_count, fo_pkt_count,
182 	    ip_nv_lookup(ire_nv_tbl, (int)ire->ire_type),
183 	    inet_ntop(AF_INET6, &ire->ire_addr_v6, buf1, sizeof (buf1)),
184 	    inet_ntop(AF_INET6, &ire->ire_mask_v6, buf2, sizeof (buf2)),
185 	    inet_ntop(AF_INET6, &ire->ire_src_addr_v6, buf3, sizeof (buf3)),
186 	    inet_ntop(AF_INET6, &gw_addr_v6, buf4, sizeof (buf4)));
187 	if (print_len < buf_len) {
188 		((mblk_t *)mp)->b_wptr += print_len;
189 	} else {
190 		((mblk_t *)mp)->b_wptr += buf_len;
191 	}
192 }
193 
194 /* ire_walk routine invoked for ip_ire_report_v6 for each IRE. */
195 static void
196 ire_report_ctable_v6(ire_t *ire, char *mp)
197 {
198 	char	buf1[INET6_ADDRSTRLEN];
199 	char	buf2[INET6_ADDRSTRLEN];
200 	char	buf3[INET6_ADDRSTRLEN];
201 	char	buf4[INET6_ADDRSTRLEN];
202 	uint_t	fo_pkt_count;
203 	uint_t	ib_pkt_count;
204 	int	ref;
205 	in6_addr_t gw_addr_v6;
206 	uint_t	print_len, buf_len;
207 
208 	if ((ire->ire_type & IRE_CACHETABLE) == 0)
209 		return;
210 	buf_len = ((mblk_t *)mp)->b_datap->db_lim - ((mblk_t *)mp)->b_wptr;
211 	if (buf_len <= 0)
212 		return;
213 
214 	/* Number of active references of this ire */
215 	ref = ire->ire_refcnt;
216 	/* "inbound" to a non local address is a forward */
217 	ib_pkt_count = ire->ire_ib_pkt_count;
218 	fo_pkt_count = 0;
219 	ASSERT(!(ire->ire_type & IRE_BROADCAST));
220 	if (ire->ire_type & IRE_LOCAL) {
221 		fo_pkt_count = ib_pkt_count;
222 		ib_pkt_count = 0;
223 	}
224 
225 	mutex_enter(&ire->ire_lock);
226 	gw_addr_v6 = ire->ire_gateway_addr_v6;
227 	mutex_exit(&ire->ire_lock);
228 
229 	print_len =  snprintf((char *)((mblk_t *)mp)->b_wptr, buf_len,
230 	    MI_COL_PTRFMT_STR MI_COL_PTRFMT_STR MI_COL_PTRFMT_STR "%5d "
231 	    "%05d %05ld %06ld %08d %03d %06d %09d %09d %06d %08d "
232 	    "%04d %08d %08d %d/%d/%d %s\n\t%s\n\t%s\n\t%s\n\t%s\n",
233 	    (void *)ire, (void *)ire->ire_rfq, (void *)ire->ire_stq,
234 	    (int)ire->ire_zoneid,
235 	    ire->ire_max_frag, ire->ire_uinfo.iulp_rtt,
236 	    ire->ire_uinfo.iulp_rtt_sd, ire->ire_uinfo.iulp_ssthresh, ref,
237 	    ire->ire_uinfo.iulp_rtomax,
238 	    (ire->ire_uinfo.iulp_tstamp_ok ? 1: 0),
239 	    (ire->ire_uinfo.iulp_wscale_ok ? 1: 0),
240 	    (ire->ire_uinfo.iulp_ecn_ok ? 1: 0),
241 	    (ire->ire_uinfo.iulp_pmtud_ok ? 1: 0),
242 	    ire->ire_uinfo.iulp_sack,
243 	    ire->ire_uinfo.iulp_spipe, ire->ire_uinfo.iulp_rpipe,
244 	    ib_pkt_count, ire->ire_ob_pkt_count,
245 	    fo_pkt_count, ip_nv_lookup(ire_nv_tbl, (int)ire->ire_type),
246 	    inet_ntop(AF_INET6, &ire->ire_addr_v6, buf1, sizeof (buf1)),
247 	    inet_ntop(AF_INET6, &ire->ire_mask_v6, buf2, sizeof (buf2)),
248 	    inet_ntop(AF_INET6, &ire->ire_src_addr_v6, buf3, sizeof (buf3)),
249 	    inet_ntop(AF_INET6, &gw_addr_v6, buf4, sizeof (buf4)));
250 	if (print_len < buf_len) {
251 		((mblk_t *)mp)->b_wptr += print_len;
252 	} else {
253 		((mblk_t *)mp)->b_wptr += buf_len;
254 	}
255 }
256 
257 
258 /*
259  * Initialize the ire that is specific to IPv6 part and call
260  * ire_init_common to finish it.
261  */
262 ire_t *
263 ire_init_v6(ire_t *ire, const in6_addr_t *v6addr,
264     const in6_addr_t *v6mask, const in6_addr_t *v6src_addr,
265     const in6_addr_t *v6gateway, uint_t *max_fragp,
266     mblk_t *fp_mp, queue_t *rfq, queue_t *stq, ushort_t type,
267     mblk_t *dlureq_mp, ipif_t *ipif, const in6_addr_t *v6cmask,
268     uint32_t phandle, uint32_t ihandle, uint_t flags, const iulp_t *ulp_info,
269     tsol_gc_t *gc, tsol_gcgrp_t *gcgrp)
270 {
271 
272 	/*
273 	 * Reject IRE security attribute creation/initialization
274 	 * if system is not running in Trusted mode.
275 	 */
276 	if ((gc != NULL || gcgrp != NULL) && !is_system_labeled())
277 		return (NULL);
278 
279 	if (fp_mp != NULL) {
280 		/*
281 		 * We can't dupb() here as multiple threads could be
282 		 * calling dupb on the same mp which is incorrect.
283 		 * First dupb() should be called only by one thread.
284 		 */
285 		fp_mp = copyb(fp_mp);
286 		if (fp_mp == NULL)
287 			return (NULL);
288 	}
289 
290 	if (dlureq_mp != NULL) {
291 		/*
292 		 * We can't dupb() here as multiple threads could be
293 		 * calling dupb on the same mp which is incorrect.
294 		 * First dupb() should be called only by one thread.
295 		 */
296 		dlureq_mp = copyb(dlureq_mp);
297 		if (dlureq_mp == NULL) {
298 			if (fp_mp != NULL)
299 				freeb(fp_mp);
300 			return (NULL);
301 		}
302 	}
303 
304 	BUMP_IRE_STATS(ire_stats_v6, ire_stats_alloced);
305 	ire->ire_addr_v6 = *v6addr;
306 
307 	if (v6src_addr != NULL)
308 		ire->ire_src_addr_v6 = *v6src_addr;
309 	if (v6mask != NULL) {
310 		ire->ire_mask_v6 = *v6mask;
311 		ire->ire_masklen = ip_mask_to_plen_v6(&ire->ire_mask_v6);
312 	}
313 	if (v6gateway != NULL)
314 		ire->ire_gateway_addr_v6 = *v6gateway;
315 
316 	if (type == IRE_CACHE && v6cmask != NULL)
317 		ire->ire_cmask_v6 = *v6cmask;
318 
319 	/*
320 	 * Multirouted packets need to have a fragment header added so that
321 	 * the receiver is able to discard duplicates according to their
322 	 * fragment identifier.
323 	 */
324 	if (type == IRE_CACHE && (flags & RTF_MULTIRT)) {
325 		ire->ire_frag_flag = IPH_FRAG_HDR;
326 	}
327 
328 	/* ire_init_common will free the mblks upon encountering any failure */
329 	if (!ire_init_common(ire, max_fragp, fp_mp, rfq, stq, type, dlureq_mp,
330 	    ipif, NULL, phandle, ihandle, flags, IPV6_VERSION, ulp_info,
331 	    gc, gcgrp))
332 		return (NULL);
333 
334 	return (ire);
335 }
336 
337 /*
338  * Similar to ire_create_v6 except that it is called only when
339  * we want to allocate ire as an mblk e.g. we have a external
340  * resolver. Do we need this in IPv6 ?
341  */
342 ire_t *
343 ire_create_mp_v6(const in6_addr_t *v6addr, const in6_addr_t *v6mask,
344     const in6_addr_t *v6src_addr, const in6_addr_t *v6gateway,
345     mblk_t *fp_mp, queue_t *rfq, queue_t *stq, ushort_t type,
346     mblk_t *dlureq_mp, ipif_t *ipif, const in6_addr_t *v6cmask,
347     uint32_t phandle, uint32_t ihandle, uint_t flags, const iulp_t *ulp_info,
348     tsol_gc_t *gc, tsol_gcgrp_t *gcgrp)
349 {
350 	ire_t	*ire;
351 	ire_t	*ret_ire;
352 	mblk_t	*mp;
353 
354 	ASSERT(!IN6_IS_ADDR_V4MAPPED(v6addr));
355 
356 	/* Allocate the new IRE. */
357 	mp = allocb(sizeof (ire_t), BPRI_MED);
358 	if (mp == NULL) {
359 		ip1dbg(("ire_create_mp_v6: alloc failed\n"));
360 		return (NULL);
361 	}
362 
363 	ire = (ire_t *)mp->b_rptr;
364 	mp->b_wptr = (uchar_t *)&ire[1];
365 
366 	/* Start clean. */
367 	*ire = ire_null;
368 	ire->ire_mp = mp;
369 	mp->b_datap->db_type = IRE_DB_TYPE;
370 
371 	ret_ire = ire_init_v6(ire, v6addr, v6mask, v6src_addr, v6gateway,
372 	    NULL, fp_mp, rfq, stq, type, dlureq_mp, ipif, v6cmask, phandle,
373 	    ihandle, flags, ulp_info, gc, gcgrp);
374 
375 	if (ret_ire == NULL) {
376 		freeb(ire->ire_mp);
377 		return (NULL);
378 	}
379 	return (ire);
380 }
381 
382 /*
383  * ire_create_v6 is called to allocate and initialize a new IRE.
384  *
385  * NOTE : This is called as writer sometimes though not required
386  * by this function.
387  */
388 ire_t *
389 ire_create_v6(const in6_addr_t *v6addr, const in6_addr_t *v6mask,
390     const in6_addr_t *v6src_addr, const in6_addr_t *v6gateway,
391     uint_t *max_fragp, mblk_t *fp_mp, queue_t *rfq, queue_t *stq, ushort_t type,
392     mblk_t *dlureq_mp, ipif_t *ipif, const in6_addr_t *v6cmask,
393     uint32_t phandle, uint32_t ihandle, uint_t flags, const iulp_t *ulp_info,
394     tsol_gc_t *gc, tsol_gcgrp_t *gcgrp)
395 {
396 	ire_t	*ire;
397 	ire_t	*ret_ire;
398 
399 	ASSERT(!IN6_IS_ADDR_V4MAPPED(v6addr));
400 
401 	ire = kmem_cache_alloc(ire_cache, KM_NOSLEEP);
402 	if (ire == NULL) {
403 		ip1dbg(("ire_create_v6: alloc failed\n"));
404 		return (NULL);
405 	}
406 	*ire = ire_null;
407 
408 	ret_ire = ire_init_v6(ire, v6addr, v6mask, v6src_addr, v6gateway,
409 	    max_fragp, fp_mp, rfq, stq, type, dlureq_mp, ipif, v6cmask, phandle,
410 	    ihandle, flags, ulp_info, gc, gcgrp);
411 
412 	if (ret_ire == NULL) {
413 		kmem_cache_free(ire_cache, ire);
414 		return (NULL);
415 	}
416 	ASSERT(ret_ire == ire);
417 	return (ire);
418 }
419 
420 /*
421  * Find an IRE_INTERFACE for the multicast group.
422  * Allows different routes for multicast addresses
423  * in the unicast routing table (akin to FF::0/8 but could be more specific)
424  * which point at different interfaces. This is used when IPV6_MULTICAST_IF
425  * isn't specified (when sending) and when IPV6_JOIN_GROUP doesn't
426  * specify the interface to join on.
427  *
428  * Supports link-local addresses by following the ipif/ill when recursing.
429  */
430 ire_t *
431 ire_lookup_multi_v6(const in6_addr_t *group, zoneid_t zoneid)
432 {
433 	ire_t	*ire;
434 	ipif_t	*ipif = NULL;
435 	int	match_flags = MATCH_IRE_TYPE;
436 	in6_addr_t gw_addr_v6;
437 
438 	ire = ire_ftable_lookup_v6(group, 0, 0, 0, NULL, NULL,
439 	    zoneid, 0, NULL, MATCH_IRE_DEFAULT);
440 
441 	/* We search a resolvable ire in case of multirouting. */
442 	if ((ire != NULL) && (ire->ire_flags & RTF_MULTIRT)) {
443 		ire_t *cire = NULL;
444 		/*
445 		 * If the route is not resolvable, the looked up ire
446 		 * may be changed here. In that case, ire_multirt_lookup()
447 		 * IRE_REFRELE the original ire and change it.
448 		 */
449 		(void) ire_multirt_lookup_v6(&cire, &ire, MULTIRT_CACHEGW,
450 		    NULL);
451 		if (cire != NULL)
452 			ire_refrele(cire);
453 	}
454 	if (ire == NULL)
455 		return (NULL);
456 	/*
457 	 * Make sure we follow ire_ipif.
458 	 *
459 	 * We need to determine the interface route through
460 	 * which the gateway will be reached. We don't really
461 	 * care which interface is picked if the interface is
462 	 * part of a group.
463 	 */
464 	if (ire->ire_ipif != NULL) {
465 		ipif = ire->ire_ipif;
466 		match_flags |= MATCH_IRE_ILL_GROUP;
467 	}
468 
469 	switch (ire->ire_type) {
470 	case IRE_DEFAULT:
471 	case IRE_PREFIX:
472 	case IRE_HOST:
473 		mutex_enter(&ire->ire_lock);
474 		gw_addr_v6 = ire->ire_gateway_addr_v6;
475 		mutex_exit(&ire->ire_lock);
476 		ire_refrele(ire);
477 		ire = ire_ftable_lookup_v6(&gw_addr_v6, 0, 0,
478 		    IRE_INTERFACE, ipif, NULL, zoneid, 0,
479 		    NULL, match_flags);
480 		return (ire);
481 	case IRE_IF_NORESOLVER:
482 	case IRE_IF_RESOLVER:
483 		return (ire);
484 	default:
485 		ire_refrele(ire);
486 		return (NULL);
487 	}
488 }
489 
490 /*
491  * Return any local address.  We use this to target ourselves
492  * when the src address was specified as 'default'.
493  * Preference for IRE_LOCAL entries.
494  */
495 ire_t *
496 ire_lookup_local_v6(zoneid_t zoneid)
497 {
498 	ire_t	*ire;
499 	irb_t	*irb;
500 	ire_t	*maybe = NULL;
501 	int i;
502 
503 	for (i = 0; i < ip6_cache_table_size;  i++) {
504 		irb = &ip_cache_table_v6[i];
505 		if (irb->irb_ire == NULL)
506 			continue;
507 		rw_enter(&irb->irb_lock, RW_READER);
508 		for (ire = irb->irb_ire; ire; ire = ire->ire_next) {
509 			if ((ire->ire_marks & IRE_MARK_CONDEMNED) ||
510 			    ire->ire_zoneid != zoneid &&
511 			    ire->ire_zoneid != ALL_ZONES)
512 				continue;
513 			switch (ire->ire_type) {
514 			case IRE_LOOPBACK:
515 				if (maybe == NULL) {
516 					IRE_REFHOLD(ire);
517 					maybe = ire;
518 				}
519 				break;
520 			case IRE_LOCAL:
521 				if (maybe != NULL) {
522 					ire_refrele(maybe);
523 				}
524 				IRE_REFHOLD(ire);
525 				rw_exit(&irb->irb_lock);
526 				return (ire);
527 			}
528 		}
529 		rw_exit(&irb->irb_lock);
530 	}
531 	return (maybe);
532 }
533 
534 /*
535  * This function takes a mask and returns number of bits set in the
536  * mask (the represented prefix length).  Assumes a contiguous mask.
537  */
538 int
539 ip_mask_to_plen_v6(const in6_addr_t *v6mask)
540 {
541 	int		bits;
542 	int		plen = IPV6_ABITS;
543 	int		i;
544 
545 	for (i = 3; i >= 0; i--) {
546 		if (v6mask->s6_addr32[i] == 0) {
547 			plen -= 32;
548 			continue;
549 		}
550 		bits = ffs(ntohl(v6mask->s6_addr32[i])) - 1;
551 		if (bits == 0)
552 			break;
553 		plen -= bits;
554 	}
555 
556 	return (plen);
557 }
558 
559 /*
560  * Convert a prefix length to the mask for that prefix.
561  * Returns the argument bitmask.
562  */
563 in6_addr_t *
564 ip_plen_to_mask_v6(uint_t plen, in6_addr_t *bitmask)
565 {
566 	uint32_t *ptr;
567 
568 	if (plen < 0 || plen > IPV6_ABITS)
569 		return (NULL);
570 	*bitmask = ipv6_all_zeros;
571 
572 	ptr = (uint32_t *)bitmask;
573 	while (plen > 32) {
574 		*ptr++ = 0xffffffffU;
575 		plen -= 32;
576 	}
577 	*ptr = htonl(0xffffffffU << (32 - plen));
578 	return (bitmask);
579 }
580 
581 /*
582  * Add a fully initialized IRE to an appropriate
583  * table based on ire_type.
584  *
585  * The forward table contains IRE_PREFIX/IRE_HOST/IRE_HOST_REDIRECT
586  * IRE_IF_RESOLVER/IRE_IF_NORESOLVER and IRE_DEFAULT.
587  *
588  * The cache table contains IRE_BROADCAST/IRE_LOCAL/IRE_LOOPBACK
589  * and IRE_CACHE.
590  *
591  * NOTE : This function is called as writer though not required
592  * by this function.
593  */
594 int
595 ire_add_v6(ire_t **ire_p, queue_t *q, mblk_t *mp, ipsq_func_t func)
596 {
597 	ire_t	*ire1;
598 	int	mask_table_index;
599 	irb_t	*irb_ptr;
600 	ire_t	**irep;
601 	int	flags;
602 	ire_t	*pire = NULL;
603 	ill_t	*stq_ill;
604 	boolean_t	ndp_g_lock_held = B_FALSE;
605 	ire_t	*ire = *ire_p;
606 	int	error;
607 
608 	ASSERT(ire->ire_ipversion == IPV6_VERSION);
609 	ASSERT(ire->ire_mp == NULL); /* Calls should go through ire_add */
610 	ASSERT(ire->ire_nce == NULL);
611 
612 	/* Find the appropriate list head. */
613 	switch (ire->ire_type) {
614 	case IRE_HOST:
615 		ire->ire_mask_v6 = ipv6_all_ones;
616 		ire->ire_masklen = IPV6_ABITS;
617 		if ((ire->ire_flags & RTF_SETSRC) == 0)
618 			ire->ire_src_addr_v6 = ipv6_all_zeros;
619 		break;
620 	case IRE_HOST_REDIRECT:
621 		ire->ire_mask_v6 = ipv6_all_ones;
622 		ire->ire_masklen = IPV6_ABITS;
623 		ire->ire_src_addr_v6 = ipv6_all_zeros;
624 		break;
625 	case IRE_CACHE:
626 	case IRE_LOCAL:
627 	case IRE_LOOPBACK:
628 		ire->ire_mask_v6 = ipv6_all_ones;
629 		ire->ire_masklen = IPV6_ABITS;
630 		break;
631 	case IRE_PREFIX:
632 		if ((ire->ire_flags & RTF_SETSRC) == 0)
633 			ire->ire_src_addr_v6 = ipv6_all_zeros;
634 		break;
635 	case IRE_DEFAULT:
636 		if ((ire->ire_flags & RTF_SETSRC) == 0)
637 			ire->ire_src_addr_v6 = ipv6_all_zeros;
638 		break;
639 	case IRE_IF_RESOLVER:
640 	case IRE_IF_NORESOLVER:
641 		break;
642 	default:
643 		printf("ire_add_v6: ire %p has unrecognized IRE type (%d)\n",
644 		    (void *)ire, ire->ire_type);
645 		ire_delete(ire);
646 		*ire_p = NULL;
647 		return (EINVAL);
648 	}
649 
650 	/* Make sure the address is properly masked. */
651 	V6_MASK_COPY(ire->ire_addr_v6, ire->ire_mask_v6, ire->ire_addr_v6);
652 
653 	if ((ire->ire_type & IRE_CACHETABLE) == 0) {
654 		/* IRE goes into Forward Table */
655 		mask_table_index = ip_mask_to_plen_v6(&ire->ire_mask_v6);
656 		if ((ip_forwarding_table_v6[mask_table_index]) == NULL) {
657 			irb_t *ptr;
658 			int i;
659 
660 			ptr = (irb_t *)mi_zalloc((ip6_ftable_hash_size *
661 			    sizeof (irb_t)));
662 			if (ptr == NULL) {
663 				ire_delete(ire);
664 				*ire_p = NULL;
665 				return (ENOMEM);
666 			}
667 			for (i = 0; i < ip6_ftable_hash_size; i++) {
668 				rw_init(&ptr[i].irb_lock, NULL,
669 				    RW_DEFAULT, NULL);
670 			}
671 			mutex_enter(&ire_ft_init_lock);
672 			if (ip_forwarding_table_v6[mask_table_index] == NULL) {
673 				ip_forwarding_table_v6[mask_table_index] = ptr;
674 				mutex_exit(&ire_ft_init_lock);
675 			} else {
676 				/*
677 				 * Some other thread won the race in
678 				 * initializing the forwarding table at the
679 				 * same index.
680 				 */
681 				mutex_exit(&ire_ft_init_lock);
682 				for (i = 0; i < ip6_ftable_hash_size; i++) {
683 					rw_destroy(&ptr[i].irb_lock);
684 				}
685 				mi_free(ptr);
686 			}
687 		}
688 		irb_ptr = &(ip_forwarding_table_v6[mask_table_index][
689 		    IRE_ADDR_MASK_HASH_V6(ire->ire_addr_v6, ire->ire_mask_v6,
690 		    ip6_ftable_hash_size)]);
691 	} else {
692 		irb_ptr = &(ip_cache_table_v6[IRE_ADDR_HASH_V6(
693 		    ire->ire_addr_v6, ip6_cache_table_size)]);
694 	}
695 	/*
696 	 * For xresolv interfaces (v6 interfaces with an external
697 	 * address resolver), ip_newroute_v6/ip_newroute_ipif_v6
698 	 * are unable to prevent the deletion of the interface route
699 	 * while adding an IRE_CACHE for an on-link destination
700 	 * in the IRE_IF_RESOLVER case, since the ire has to go to
701 	 * the external resolver and return. We can't do a REFHOLD on the
702 	 * associated interface ire for fear of the message being freed
703 	 * if the external resolver can't resolve the address.
704 	 * Here we look up the interface ire in the forwarding table
705 	 * and make sure that the interface route has not been deleted.
706 	 */
707 	if (ire->ire_type == IRE_CACHE &&
708 	    IN6_IS_ADDR_UNSPECIFIED(&ire->ire_gateway_addr_v6) &&
709 	    (((ill_t *)ire->ire_stq->q_ptr)->ill_net_type == IRE_IF_RESOLVER) &&
710 	    (((ill_t *)ire->ire_stq->q_ptr)->ill_flags & ILLF_XRESOLV)) {
711 
712 		pire = ire_ihandle_lookup_onlink_v6(ire);
713 		if (pire == NULL) {
714 			ire_delete(ire);
715 			*ire_p = NULL;
716 			return (EINVAL);
717 		}
718 		/* Prevent pire from getting deleted */
719 		IRB_REFHOLD(pire->ire_bucket);
720 		/* Has it been removed already? */
721 		if (pire->ire_marks & IRE_MARK_CONDEMNED) {
722 			IRB_REFRELE(pire->ire_bucket);
723 			ire_refrele(pire);
724 			ire_delete(ire);
725 			*ire_p = NULL;
726 			return (EINVAL);
727 		}
728 	}
729 
730 	flags = (MATCH_IRE_MASK | MATCH_IRE_TYPE | MATCH_IRE_GW);
731 	/*
732 	 * For IRE_CACHES, MATCH_IRE_IPIF is not enough to check
733 	 * for duplicates because :
734 	 *
735 	 * 1) ire_ipif->ipif_ill and ire_stq->q_ptr could be
736 	 *    pointing at different ills. A real duplicate is
737 	 *    a match on both ire_ipif and ire_stq.
738 	 *
739 	 * 2) We could have multiple packets trying to create
740 	 *    an IRE_CACHE for the same ill.
741 	 *
742 	 * Moreover, IPIF_NOFAILOVER and IPV6_BOUND_PIF endpoints wants
743 	 * to go out on a particular ill. Rather than looking at the
744 	 * packet, we depend on the above for MATCH_IRE_ILL here.
745 	 *
746 	 * Unlike IPv4, MATCH_IRE_IPIF is needed here as we could have
747 	 * multiple IRE_CACHES for an ill for the same destination
748 	 * with various scoped addresses i.e represented by ipifs.
749 	 *
750 	 * MATCH_IRE_ILL is done implicitly below for IRE_CACHES.
751 	 */
752 	if (ire->ire_ipif != NULL)
753 		flags |= MATCH_IRE_IPIF;
754 	/*
755 	 * If we are creating hidden ires, make sure we search on
756 	 * this ill (MATCH_IRE_ILL) and a hidden ire, while we are
757 	 * searching for duplicates below. Otherwise we could
758 	 * potentially find an IRE on some other interface
759 	 * and it may not be a IRE marked with IRE_MARK_HIDDEN. We
760 	 * shouldn't do this as this will lead to an infinite loop as
761 	 * eventually we need an hidden ire for this packet to go
762 	 * out. MATCH_IRE_ILL is already marked above.
763 	 */
764 	if (ire->ire_marks & IRE_MARK_HIDDEN) {
765 		ASSERT(ire->ire_type == IRE_CACHE);
766 		flags |= MATCH_IRE_MARK_HIDDEN;
767 	}
768 
769 	/*
770 	 * Start the atomic add of the ire. Grab the ill locks,
771 	 * ill_g_usesrc_lock and the bucket lock. Check for condemned.
772 	 * To avoid lock order problems, get the ndp6.ndp_g_lock now itself.
773 	 */
774 	if (ire->ire_type == IRE_CACHE) {
775 		mutex_enter(&ndp6.ndp_g_lock);
776 		ndp_g_lock_held = B_TRUE;
777 	}
778 
779 	/*
780 	 * If ipif or ill is changing ire_atomic_start() may queue the
781 	 * request and return EINPROGRESS.
782 	 */
783 
784 	error = ire_atomic_start(irb_ptr, ire, q, mp, func);
785 	if (error != 0) {
786 		if (ndp_g_lock_held)
787 			mutex_exit(&ndp6.ndp_g_lock);
788 		/*
789 		 * We don't know whether it is a valid ipif or not.
790 		 * So, set it to NULL. This assumes that the ire has not added
791 		 * a reference to the ipif.
792 		 */
793 		ire->ire_ipif = NULL;
794 		ire_delete(ire);
795 		if (pire != NULL) {
796 			IRB_REFRELE(pire->ire_bucket);
797 			ire_refrele(pire);
798 		}
799 		*ire_p = NULL;
800 		return (error);
801 	}
802 	/*
803 	 * To avoid creating ires having stale values for the ire_max_frag
804 	 * we get the latest value atomically here. For more details
805 	 * see the block comment in ip_sioctl_mtu and in DL_NOTE_SDU_CHANGE
806 	 * in ip_rput_dlpi_writer
807 	 */
808 	if (ire->ire_max_fragp == NULL) {
809 		if (IN6_IS_ADDR_MULTICAST(&ire->ire_addr_v6))
810 			ire->ire_max_frag = ire->ire_ipif->ipif_mtu;
811 		else
812 			ire->ire_max_frag = pire->ire_max_frag;
813 	} else {
814 		uint_t  max_frag;
815 
816 		max_frag = *ire->ire_max_fragp;
817 		ire->ire_max_fragp = NULL;
818 		ire->ire_max_frag = max_frag;
819 	}
820 
821 	/*
822 	 * Atomically check for duplicate and insert in the table.
823 	 */
824 	for (ire1 = irb_ptr->irb_ire; ire1 != NULL; ire1 = ire1->ire_next) {
825 		if (ire1->ire_marks & IRE_MARK_CONDEMNED)
826 			continue;
827 
828 		if (ire->ire_type == IRE_CACHE) {
829 			/*
830 			 * We do MATCH_IRE_ILL implicitly here for IRE_CACHES.
831 			 * As ire_ipif and ire_stq could point to two
832 			 * different ills, we can't pass just ire_ipif to
833 			 * ire_match_args and get a match on both ills.
834 			 * This is just needed for duplicate checks here and
835 			 * so we don't add an extra argument to
836 			 * ire_match_args for this. Do it locally.
837 			 *
838 			 * NOTE : Currently there is no part of the code
839 			 * that asks for both MATCH_IRE_IPIF and MATCH_IRE_ILL
840 			 * match for IRE_CACHEs. Thus we don't want to
841 			 * extend the arguments to ire_match_args_v6.
842 			 */
843 			if (ire1->ire_stq != ire->ire_stq)
844 				continue;
845 			/*
846 			 * Multiroute IRE_CACHEs for a given destination can
847 			 * have the same ire_ipif, typically if their source
848 			 * address is forced using RTF_SETSRC, and the same
849 			 * send-to queue. We differentiate them using the parent
850 			 * handle.
851 			 */
852 			if ((ire1->ire_flags & RTF_MULTIRT) &&
853 			    (ire->ire_flags & RTF_MULTIRT) &&
854 			    (ire1->ire_phandle != ire->ire_phandle))
855 				continue;
856 		}
857 		if (ire1->ire_zoneid != ire->ire_zoneid)
858 			continue;
859 		if (ire_match_args_v6(ire1, &ire->ire_addr_v6,
860 		    &ire->ire_mask_v6, &ire->ire_gateway_addr_v6,
861 		    ire->ire_type, ire->ire_ipif, ire->ire_zoneid, 0, NULL,
862 		    flags)) {
863 			/*
864 			 * Return the old ire after doing a REFHOLD.
865 			 * As most of the callers continue to use the IRE
866 			 * after adding, we return a held ire. This will
867 			 * avoid a lookup in the caller again. If the callers
868 			 * don't want to use it, they need to do a REFRELE.
869 			 */
870 			ip1dbg(("found dup ire existing %p new %p",
871 			    (void *)ire1, (void *)ire));
872 			IRE_REFHOLD(ire1);
873 			if (ndp_g_lock_held)
874 				mutex_exit(&ndp6.ndp_g_lock);
875 			ire_atomic_end(irb_ptr, ire);
876 			ire_delete(ire);
877 			if (pire != NULL) {
878 				/*
879 				 * Assert that it is
880 				 * not yet removed from the list.
881 				 */
882 				ASSERT(pire->ire_ptpn != NULL);
883 				IRB_REFRELE(pire->ire_bucket);
884 				ire_refrele(pire);
885 			}
886 			*ire_p = ire1;
887 			return (0);
888 		}
889 	}
890 	if (ire->ire_type == IRE_CACHE) {
891 		in6_addr_t gw_addr_v6;
892 		ill_t	*ill = ire_to_ill(ire);
893 		char	buf[INET6_ADDRSTRLEN];
894 		nce_t	*nce;
895 
896 		/*
897 		 * All IRE_CACHE types must have a nce.  If this is
898 		 * not the case the entry will not be added. We need
899 		 * to make sure that if somebody deletes the nce
900 		 * after we looked up, they will find this ire and
901 		 * delete the ire. To delete this ire one needs the
902 		 * bucket lock which we are still holding here. So,
903 		 * even if the nce gets deleted after we looked up,
904 		 * this ire  will get deleted.
905 		 *
906 		 * NOTE : Don't need the ire_lock for accessing
907 		 * ire_gateway_addr_v6 as it is appearing first
908 		 * time on the list and rts_setgwr_v6 could not
909 		 * be changing this.
910 		 */
911 		gw_addr_v6 = ire->ire_gateway_addr_v6;
912 		if (IN6_IS_ADDR_UNSPECIFIED(&gw_addr_v6)) {
913 			nce = ndp_lookup_v6(ill, &ire->ire_addr_v6, B_TRUE);
914 		} else {
915 			nce = ndp_lookup_v6(ill, &gw_addr_v6, B_TRUE);
916 		}
917 		if (nce == NULL)
918 			goto failed;
919 
920 		/* Pair of refhold, refrele just to get the tracing right */
921 		NCE_REFHOLD_TO_REFHOLD_NOTR(nce);
922 		/*
923 		 * Atomically make sure that new IREs don't point
924 		 * to an NCE that is logically deleted (CONDEMNED).
925 		 * ndp_delete() first marks the NCE CONDEMNED.
926 		 * This ensures that the nce_refcnt won't increase
927 		 * due to new nce_lookups or due to addition of new IREs
928 		 * pointing to this NCE. Then ndp_delete() cleans up
929 		 * existing references. If we don't do it atomically here,
930 		 * ndp_delete() -> nce_ire_delete() will not be able to
931 		 * clean up the IRE list completely, and the nce_refcnt
932 		 * won't go down to zero.
933 		 */
934 		mutex_enter(&nce->nce_lock);
935 		if (ill->ill_flags & ILLF_XRESOLV) {
936 			/*
937 			 * If we used an external resolver, we may not
938 			 * have gone through neighbor discovery to get here.
939 			 * Must update the nce_state before the next check.
940 			 */
941 			if (nce->nce_state == ND_INCOMPLETE)
942 				nce->nce_state = ND_REACHABLE;
943 		}
944 		if (nce->nce_state == ND_INCOMPLETE ||
945 		    (nce->nce_flags & NCE_F_CONDEMNED) ||
946 		    (nce->nce_state == ND_UNREACHABLE)) {
947 failed:
948 			if (ndp_g_lock_held)
949 				mutex_exit(&ndp6.ndp_g_lock);
950 			if (nce != NULL)
951 				mutex_exit(&nce->nce_lock);
952 			ire_atomic_end(irb_ptr, ire);
953 			ip1dbg(("ire_add_v6: No nce for dst %s \n",
954 			    inet_ntop(AF_INET6, &ire->ire_addr_v6,
955 			    buf, sizeof (buf))));
956 			ire_delete(ire);
957 			if (pire != NULL) {
958 				/*
959 				 * Assert that it is
960 				 * not yet removed from the list.
961 				 */
962 				ASSERT(pire->ire_ptpn != NULL);
963 				IRB_REFRELE(pire->ire_bucket);
964 				ire_refrele(pire);
965 			}
966 			if (nce != NULL)
967 				NCE_REFRELE_NOTR(nce);
968 			*ire_p = NULL;
969 			return (EINVAL);
970 		} else {
971 			ire->ire_nce = nce;
972 		}
973 		mutex_exit(&nce->nce_lock);
974 	}
975 	/*
976 	 * Find the first entry that matches ire_addr - provides
977 	 * tail insertion. *irep will be null if no match.
978 	 */
979 	irep = (ire_t **)irb_ptr;
980 	while ((ire1 = *irep) != NULL &&
981 	    !IN6_ARE_ADDR_EQUAL(&ire->ire_addr_v6, &ire1->ire_addr_v6))
982 		irep = &ire1->ire_next;
983 	ASSERT(!(ire->ire_type & IRE_BROADCAST));
984 
985 	if (*irep != NULL) {
986 		/*
987 		 * Find the last ire which matches ire_addr_v6.
988 		 * Needed to do tail insertion among entries with the same
989 		 * ire_addr_v6.
990 		 */
991 		while (IN6_ARE_ADDR_EQUAL(&ire->ire_addr_v6,
992 		    &ire1->ire_addr_v6)) {
993 			irep = &ire1->ire_next;
994 			ire1 = *irep;
995 			if (ire1 == NULL)
996 				break;
997 		}
998 	}
999 
1000 	if (ire->ire_type == IRE_DEFAULT) {
1001 		/*
1002 		 * We keep a count of default gateways which is used when
1003 		 * assigning them as routes.
1004 		 */
1005 		ipv6_ire_default_count++;
1006 		ASSERT(ipv6_ire_default_count != 0); /* Wraparound */
1007 	}
1008 	/* Insert at *irep */
1009 	ire1 = *irep;
1010 	if (ire1 != NULL)
1011 		ire1->ire_ptpn = &ire->ire_next;
1012 	ire->ire_next = ire1;
1013 	/* Link the new one in. */
1014 	ire->ire_ptpn = irep;
1015 	/*
1016 	 * ire_walk routines de-reference ire_next without holding
1017 	 * a lock. Before we point to the new ire, we want to make
1018 	 * sure the store that sets the ire_next of the new ire
1019 	 * reaches global visibility, so that ire_walk routines
1020 	 * don't see a truncated list of ires i.e if the ire_next
1021 	 * of the new ire gets set after we do "*irep = ire" due
1022 	 * to re-ordering, the ire_walk thread will see a NULL
1023 	 * once it accesses the ire_next of the new ire.
1024 	 * membar_producer() makes sure that the following store
1025 	 * happens *after* all of the above stores.
1026 	 */
1027 	membar_producer();
1028 	*irep = ire;
1029 	ire->ire_bucket = irb_ptr;
1030 	/*
1031 	 * We return a bumped up IRE above. Keep it symmetrical
1032 	 * so that the callers will always have to release. This
1033 	 * helps the callers of this function because they continue
1034 	 * to use the IRE after adding and hence they don't have to
1035 	 * lookup again after we return the IRE.
1036 	 *
1037 	 * NOTE : We don't have to use atomics as this is appearing
1038 	 * in the list for the first time and no one else can bump
1039 	 * up the reference count on this yet.
1040 	 */
1041 	IRE_REFHOLD_LOCKED(ire);
1042 	BUMP_IRE_STATS(ire_stats_v6, ire_stats_inserted);
1043 	irb_ptr->irb_ire_cnt++;
1044 	if (ire->ire_marks & IRE_MARK_TEMPORARY)
1045 		irb_ptr->irb_tmp_ire_cnt++;
1046 
1047 	if (ire->ire_ipif != NULL) {
1048 		ire->ire_ipif->ipif_ire_cnt++;
1049 		if (ire->ire_stq != NULL) {
1050 			stq_ill = (ill_t *)ire->ire_stq->q_ptr;
1051 			stq_ill->ill_ire_cnt++;
1052 		}
1053 	} else {
1054 		ASSERT(ire->ire_stq == NULL);
1055 	}
1056 
1057 	if (ndp_g_lock_held)
1058 		mutex_exit(&ndp6.ndp_g_lock);
1059 	ire_atomic_end(irb_ptr, ire);
1060 
1061 	if (pire != NULL) {
1062 		/* Assert that it is not removed from the list yet */
1063 		ASSERT(pire->ire_ptpn != NULL);
1064 		IRB_REFRELE(pire->ire_bucket);
1065 		ire_refrele(pire);
1066 	}
1067 
1068 	if (ire->ire_type != IRE_CACHE) {
1069 		/*
1070 		 * For ire's with a host mask, see if there is an entry
1071 		 * in the cache. If there is one, flush the whole cache as
1072 		 * there might be multiple entries due to RTF_MULTIRT (CGTP).
1073 		 * If no entry is found then there is no need to flush the
1074 		 * cache.
1075 		 */
1076 
1077 		if (ip_mask_to_plen_v6(&ire->ire_mask_v6) == IPV6_ABITS) {
1078 			ire_t *lire;
1079 			lire = ire_ctable_lookup_v6(&ire->ire_addr_v6, NULL,
1080 			    IRE_CACHE, NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE);
1081 			if (lire != NULL) {
1082 				ire_refrele(lire);
1083 				ire_flush_cache_v6(ire, IRE_FLUSH_ADD);
1084 			}
1085 		} else {
1086 			ire_flush_cache_v6(ire, IRE_FLUSH_ADD);
1087 		}
1088 	}
1089 
1090 	*ire_p = ire;
1091 	return (0);
1092 }
1093 
1094 /*
1095  * Search for all HOST REDIRECT routes that are
1096  * pointing at the specified gateway and
1097  * delete them. This routine is called only
1098  * when a default gateway is going away.
1099  */
1100 static void
1101 ire_delete_host_redirects_v6(const in6_addr_t *gateway)
1102 {
1103 	irb_t *irb_ptr;
1104 	irb_t *irb;
1105 	ire_t *ire;
1106 	in6_addr_t gw_addr_v6;
1107 	int i;
1108 
1109 	/* get the hash table for HOST routes */
1110 	irb_ptr = ip_forwarding_table_v6[(IP6_MASK_TABLE_SIZE - 1)];
1111 	if (irb_ptr == NULL)
1112 		return;
1113 	for (i = 0; (i < ip6_ftable_hash_size); i++) {
1114 		irb = &irb_ptr[i];
1115 		IRB_REFHOLD(irb);
1116 		for (ire = irb->irb_ire; ire != NULL; ire = ire->ire_next) {
1117 			if (ire->ire_type != IRE_HOST_REDIRECT)
1118 				continue;
1119 			mutex_enter(&ire->ire_lock);
1120 			gw_addr_v6 = ire->ire_gateway_addr_v6;
1121 			mutex_exit(&ire->ire_lock);
1122 			if (IN6_ARE_ADDR_EQUAL(&gw_addr_v6, gateway))
1123 				ire_delete(ire);
1124 		}
1125 		IRB_REFRELE(irb);
1126 	}
1127 }
1128 
1129 /*
1130  * Delete all the cache entries with this 'addr'. This is the IPv6 counterpart
1131  * of ip_ire_clookup_and_delete. The difference being this function does not
1132  * return any value. IPv6 processing of a gratuitous ARP, as it stands, is
1133  * different than IPv4 in that, regardless of the presence of a cache entry
1134  * for this address, an ire_walk_v6 is done. Another difference is that unlike
1135  * in the case of IPv4 this does not take an ipif_t argument, since it is only
1136  * called by ip_arp_news and the match is always only on the address.
1137  */
1138 void
1139 ip_ire_clookup_and_delete_v6(const in6_addr_t *addr)
1140 {
1141 	irb_t		*irb;
1142 	ire_t		*cire;
1143 	boolean_t	found = B_FALSE;
1144 
1145 	irb = &ip_cache_table_v6[IRE_ADDR_HASH_V6(*addr, ip6_cache_table_size)];
1146 	IRB_REFHOLD(irb);
1147 	for (cire = irb->irb_ire; cire != NULL; cire = cire->ire_next) {
1148 		if (cire->ire_marks == IRE_MARK_CONDEMNED)
1149 			continue;
1150 		if (IN6_ARE_ADDR_EQUAL(&cire->ire_addr_v6, addr)) {
1151 
1152 			/* This signifies start of a match */
1153 			if (!found)
1154 				found = B_TRUE;
1155 			if (cire->ire_type == IRE_CACHE) {
1156 				if (cire->ire_nce != NULL)
1157 					ndp_delete(cire->ire_nce);
1158 				ire_delete_v6(cire);
1159 			}
1160 		/* End of the match */
1161 		} else if (found)
1162 			break;
1163 	}
1164 	IRB_REFRELE(irb);
1165 }
1166 
1167 /*
1168  * Delete the specified IRE.
1169  * All calls should use ire_delete().
1170  * Sometimes called as writer though not required by this function.
1171  *
1172  * NOTE : This function is called only if the ire was added
1173  * in the list.
1174  */
1175 void
1176 ire_delete_v6(ire_t *ire)
1177 {
1178 	in6_addr_t gw_addr_v6;
1179 
1180 	ASSERT(ire->ire_refcnt >= 1);
1181 	ASSERT(ire->ire_ipversion == IPV6_VERSION);
1182 
1183 	if (ire->ire_type != IRE_CACHE)
1184 		ire_flush_cache_v6(ire, IRE_FLUSH_DELETE);
1185 	if (ire->ire_type == IRE_DEFAULT) {
1186 		/*
1187 		 * when a default gateway is going away
1188 		 * delete all the host redirects pointing at that
1189 		 * gateway.
1190 		 */
1191 		mutex_enter(&ire->ire_lock);
1192 		gw_addr_v6 = ire->ire_gateway_addr_v6;
1193 		mutex_exit(&ire->ire_lock);
1194 		ire_delete_host_redirects_v6(&gw_addr_v6);
1195 	}
1196 }
1197 
1198 /*
1199  * ire_walk routine to delete all IRE_CACHE and IRE_HOST_REDIRECT
1200  * entries.
1201  */
1202 /*ARGSUSED1*/
1203 void
1204 ire_delete_cache_v6(ire_t *ire, char *arg)
1205 {
1206 	char    addrstr1[INET6_ADDRSTRLEN];
1207 	char    addrstr2[INET6_ADDRSTRLEN];
1208 
1209 	if (ire->ire_type & (IRE_CACHE | IRE_HOST_REDIRECT)) {
1210 		ip1dbg(("ire_delete_cache_v6: deleted %s type %d through %s\n",
1211 		    inet_ntop(AF_INET6, &ire->ire_addr_v6,
1212 			addrstr1, sizeof (addrstr1)),
1213 		    ire->ire_type,
1214 		    inet_ntop(AF_INET6, &ire->ire_gateway_addr_v6,
1215 			addrstr2, sizeof (addrstr2))));
1216 		ire_delete(ire);
1217 	}
1218 
1219 }
1220 
1221 /*
1222  * ire_walk routine to delete all IRE_CACHE/IRE_HOST_REDIRECT entries
1223  * that have a given gateway address.
1224  */
1225 void
1226 ire_delete_cache_gw_v6(ire_t *ire, char *addr)
1227 {
1228 	in6_addr_t	*gw_addr = (in6_addr_t *)addr;
1229 	char		buf1[INET6_ADDRSTRLEN];
1230 	char		buf2[INET6_ADDRSTRLEN];
1231 	in6_addr_t	ire_gw_addr_v6;
1232 
1233 	if (!(ire->ire_type & (IRE_CACHE|IRE_HOST_REDIRECT)))
1234 		return;
1235 
1236 	mutex_enter(&ire->ire_lock);
1237 	ire_gw_addr_v6 = ire->ire_gateway_addr_v6;
1238 	mutex_exit(&ire->ire_lock);
1239 
1240 	if (IN6_ARE_ADDR_EQUAL(&ire_gw_addr_v6, gw_addr)) {
1241 		ip1dbg(("ire_delete_cache_gw_v6: deleted %s type %d to %s\n",
1242 		    inet_ntop(AF_INET6, &ire->ire_src_addr_v6,
1243 		    buf1, sizeof (buf1)),
1244 		    ire->ire_type,
1245 		    inet_ntop(AF_INET6, &ire_gw_addr_v6,
1246 		    buf2, sizeof (buf2))));
1247 		ire_delete(ire);
1248 	}
1249 }
1250 
1251 /*
1252  * Remove all IRE_CACHE entries that match
1253  * the ire specified.  (Sometimes called
1254  * as writer though not required by this function.)
1255  *
1256  * The flag argument indicates if the
1257  * flush request is due to addition
1258  * of new route (IRE_FLUSH_ADD) or deletion of old
1259  * route (IRE_FLUSH_DELETE).
1260  *
1261  * This routine takes only the IREs from the forwarding
1262  * table and flushes the corresponding entries from
1263  * the cache table.
1264  *
1265  * When flushing due to the deletion of an old route, it
1266  * just checks the cache handles (ire_phandle and ire_ihandle) and
1267  * deletes the ones that match.
1268  *
1269  * When flushing due to the creation of a new route, it checks
1270  * if a cache entry's address matches the one in the IRE and
1271  * that the cache entry's parent has a less specific mask than the
1272  * one in IRE. The destination of such a cache entry could be the
1273  * gateway for other cache entries, so we need to flush those as
1274  * well by looking for gateway addresses matching the IRE's address.
1275  */
1276 void
1277 ire_flush_cache_v6(ire_t *ire, int flag)
1278 {
1279 	int i;
1280 	ire_t *cire;
1281 	irb_t *irb;
1282 
1283 	if (ire->ire_type & IRE_CACHE)
1284 	    return;
1285 
1286 	/*
1287 	 * If a default is just created, there is no point
1288 	 * in going through the cache, as there will not be any
1289 	 * cached ires.
1290 	 */
1291 	if (ire->ire_type == IRE_DEFAULT && flag == IRE_FLUSH_ADD)
1292 		return;
1293 	if (flag == IRE_FLUSH_ADD) {
1294 		/*
1295 		 * This selective flush is
1296 		 * due to the addition of
1297 		 * new IRE.
1298 		 */
1299 		for (i = 0; i < ip6_cache_table_size; i++) {
1300 			irb = &ip_cache_table_v6[i];
1301 			if ((cire = irb->irb_ire) == NULL)
1302 				continue;
1303 			IRB_REFHOLD(irb);
1304 			for (cire = irb->irb_ire; cire != NULL;
1305 			    cire = cire->ire_next) {
1306 				if (cire->ire_type != IRE_CACHE)
1307 					continue;
1308 				/*
1309 				 * If 'cire' belongs to the same subnet
1310 				 * as the new ire being added, and 'cire'
1311 				 * is derived from a prefix that is less
1312 				 * specific than the new ire being added,
1313 				 * we need to flush 'cire'; for instance,
1314 				 * when a new interface comes up.
1315 				 */
1316 				if ((V6_MASK_EQ_2(cire->ire_addr_v6,
1317 				    ire->ire_mask_v6, ire->ire_addr_v6) &&
1318 				    (ip_mask_to_plen_v6(&cire->ire_cmask_v6) <=
1319 				    ire->ire_masklen))) {
1320 					ire_delete(cire);
1321 					continue;
1322 				}
1323 				/*
1324 				 * This is the case when the ire_gateway_addr
1325 				 * of 'cire' belongs to the same subnet as
1326 				 * the new ire being added.
1327 				 * Flushing such ires is sometimes required to
1328 				 * avoid misrouting: say we have a machine with
1329 				 * two interfaces (I1 and I2), a default router
1330 				 * R on the I1 subnet, and a host route to an
1331 				 * off-link destination D with a gateway G on
1332 				 * the I2 subnet.
1333 				 * Under normal operation, we will have an
1334 				 * on-link cache entry for G and an off-link
1335 				 * cache entry for D with G as ire_gateway_addr,
1336 				 * traffic to D will reach its destination
1337 				 * through gateway G.
1338 				 * If the administrator does 'ifconfig I2 down',
1339 				 * the cache entries for D and G will be
1340 				 * flushed. However, G will now be resolved as
1341 				 * an off-link destination using R (the default
1342 				 * router) as gateway. Then D will also be
1343 				 * resolved as an off-link destination using G
1344 				 * as gateway - this behavior is due to
1345 				 * compatibility reasons, see comment in
1346 				 * ire_ihandle_lookup_offlink(). Traffic to D
1347 				 * will go to the router R and probably won't
1348 				 * reach the destination.
1349 				 * The administrator then does 'ifconfig I2 up'.
1350 				 * Since G is on the I2 subnet, this routine
1351 				 * will flush its cache entry. It must also
1352 				 * flush the cache entry for D, otherwise
1353 				 * traffic will stay misrouted until the IRE
1354 				 * times out.
1355 				 */
1356 				if (V6_MASK_EQ_2(cire->ire_gateway_addr_v6,
1357 				    ire->ire_mask_v6, ire->ire_addr_v6)) {
1358 					ire_delete(cire);
1359 					continue;
1360 				}
1361 			}
1362 			IRB_REFRELE(irb);
1363 		}
1364 	} else {
1365 		/*
1366 		 * delete the cache entries based on
1367 		 * handle in the IRE as this IRE is
1368 		 * being deleted/changed.
1369 		 */
1370 		for (i = 0; i < ip6_cache_table_size; i++) {
1371 			irb = &ip_cache_table_v6[i];
1372 			if ((cire = irb->irb_ire) == NULL)
1373 				continue;
1374 			IRB_REFHOLD(irb);
1375 			for (cire = irb->irb_ire; cire != NULL;
1376 			    cire = cire->ire_next) {
1377 				if (cire->ire_type != IRE_CACHE)
1378 					continue;
1379 				if ((cire->ire_phandle == 0 ||
1380 				    cire->ire_phandle != ire->ire_phandle) &&
1381 				    (cire->ire_ihandle == 0 ||
1382 				    cire->ire_ihandle != ire->ire_ihandle))
1383 					continue;
1384 				ire_delete(cire);
1385 			}
1386 			IRB_REFRELE(irb);
1387 		}
1388 	}
1389 }
1390 
1391 /*
1392  * Matches the arguments passed with the values in the ire.
1393  *
1394  * Note: for match types that match using "ipif" passed in, ipif
1395  * must be checked for non-NULL before calling this routine.
1396  */
1397 static boolean_t
1398 ire_match_args_v6(ire_t *ire, const in6_addr_t *addr, const in6_addr_t *mask,
1399     const in6_addr_t *gateway, int type, const ipif_t *ipif, zoneid_t zoneid,
1400     uint32_t ihandle, const ts_label_t *tsl, int match_flags)
1401 {
1402 	in6_addr_t masked_addr;
1403 	in6_addr_t gw_addr_v6;
1404 	ill_t *ire_ill = NULL, *dst_ill;
1405 	ill_t *ipif_ill = NULL;
1406 	ill_group_t *ire_ill_group = NULL;
1407 	ill_group_t *ipif_ill_group = NULL;
1408 	ipif_t	*src_ipif;
1409 
1410 	ASSERT(ire->ire_ipversion == IPV6_VERSION);
1411 	ASSERT(addr != NULL);
1412 	ASSERT(mask != NULL);
1413 	ASSERT((!(match_flags & MATCH_IRE_GW)) || gateway != NULL);
1414 	ASSERT((!(match_flags & (MATCH_IRE_ILL|MATCH_IRE_ILL_GROUP))) ||
1415 	    (ipif != NULL && ipif->ipif_isv6));
1416 	ASSERT(!(match_flags & MATCH_IRE_WQ));
1417 
1418 	/*
1419 	 * HIDDEN cache entries have to be looked up specifically with
1420 	 * MATCH_IRE_MARK_HIDDEN. MATCH_IRE_MARK_HIDDEN is usually set
1421 	 * when the interface is FAILED or INACTIVE. In that case,
1422 	 * any IRE_CACHES that exists should be marked with
1423 	 * IRE_MARK_HIDDEN. So, we don't really need to match below
1424 	 * for IRE_MARK_HIDDEN. But we do so for consistency.
1425 	 */
1426 	if (!(match_flags & MATCH_IRE_MARK_HIDDEN) &&
1427 	    (ire->ire_marks & IRE_MARK_HIDDEN))
1428 		return (B_FALSE);
1429 
1430 	if (zoneid != ALL_ZONES && zoneid != ire->ire_zoneid &&
1431 	    ire->ire_zoneid != ALL_ZONES) {
1432 		/*
1433 		 * If MATCH_IRE_ZONEONLY has been set and the supplied zoneid is
1434 		 * valid and does not match that of ire_zoneid, a failure to
1435 		 * match is reported at this point. Otherwise, since some IREs
1436 		 * that are available in the global zone can be used in local
1437 		 * zones, additional checks need to be performed:
1438 		 *
1439 		 *	IRE_CACHE and IRE_LOOPBACK entries should
1440 		 *	never be matched in this situation.
1441 		 *
1442 		 *	IRE entries that have an interface associated with them
1443 		 *	should in general not match unless they are an IRE_LOCAL
1444 		 *	or in the case when MATCH_IRE_DEFAULT has been set in
1445 		 *	the caller.  In the case of the former, checking of the
1446 		 *	other fields supplied should take place.
1447 		 *
1448 		 *	In the case where MATCH_IRE_DEFAULT has been set,
1449 		 *	all of the ipif's associated with the IRE's ill are
1450 		 *	checked to see if there is a matching zoneid.  If any
1451 		 *	one ipif has a matching zoneid, this IRE is a
1452 		 *	potential candidate so checking of the other fields
1453 		 *	takes place.
1454 		 *
1455 		 *	In the case where the IRE_INTERFACE has a usable source
1456 		 *	address (indicated by ill_usesrc_ifindex) in the
1457 		 *	correct zone then it's permitted to return this IRE
1458 		 */
1459 		if (match_flags & MATCH_IRE_ZONEONLY)
1460 			return (B_FALSE);
1461 		if (ire->ire_type & (IRE_CACHE | IRE_LOOPBACK))
1462 			return (B_FALSE);
1463 		/*
1464 		 * Note, IRE_INTERFACE can have the stq as NULL. For
1465 		 * example, if the default multicast route is tied to
1466 		 * the loopback address.
1467 		 */
1468 		if ((ire->ire_type & IRE_INTERFACE) &&
1469 		    (ire->ire_stq != NULL)) {
1470 			dst_ill = (ill_t *)ire->ire_stq->q_ptr;
1471 			/*
1472 			 * If there is a usable source address in the
1473 			 * zone, then it's ok to return an
1474 			 * IRE_INTERFACE
1475 			 */
1476 			if ((dst_ill->ill_usesrc_ifindex != 0) &&
1477 			    (src_ipif = ipif_select_source_v6(dst_ill, addr,
1478 			    RESTRICT_TO_NONE, IPV6_PREFER_SRC_DEFAULT, zoneid))
1479 			    != NULL) {
1480 				ip3dbg(("ire_match_args: src_ipif %p"
1481 				    " dst_ill %p", (void *)src_ipif,
1482 				    (void *)dst_ill));
1483 				ipif_refrele(src_ipif);
1484 			} else {
1485 				ip3dbg(("ire_match_args: src_ipif NULL"
1486 				    " dst_ill %p\n", (void *)dst_ill));
1487 				return (B_FALSE);
1488 			}
1489 		}
1490 		if (ire->ire_ipif != NULL && ire->ire_type != IRE_LOCAL &&
1491 		    !(ire->ire_type & IRE_INTERFACE)) {
1492 			ipif_t	*tipif;
1493 
1494 			if ((match_flags & MATCH_IRE_DEFAULT) == 0)
1495 				return (B_FALSE);
1496 			mutex_enter(&ire->ire_ipif->ipif_ill->ill_lock);
1497 			for (tipif = ire->ire_ipif->ipif_ill->ill_ipif;
1498 			    tipif != NULL; tipif = tipif->ipif_next) {
1499 				if (IPIF_CAN_LOOKUP(tipif) &&
1500 				    (tipif->ipif_flags & IPIF_UP) &&
1501 				    (tipif->ipif_zoneid == zoneid ||
1502 				    tipif->ipif_zoneid == ALL_ZONES))
1503 					break;
1504 			}
1505 			mutex_exit(&ire->ire_ipif->ipif_ill->ill_lock);
1506 			if (tipif == NULL)
1507 				return (B_FALSE);
1508 		}
1509 	}
1510 
1511 	if (match_flags & MATCH_IRE_GW) {
1512 		mutex_enter(&ire->ire_lock);
1513 		gw_addr_v6 = ire->ire_gateway_addr_v6;
1514 		mutex_exit(&ire->ire_lock);
1515 	}
1516 	/*
1517 	 * For IRE_CACHES, MATCH_IRE_ILL/ILL_GROUP really means that
1518 	 * somebody wants to send out on a particular interface which
1519 	 * is given by ire_stq and hence use ire_stq to derive the ill
1520 	 * value. ire_ipif for IRE_CACHES is just the
1521 	 * means of getting a source address i.e ire_src_addr_v6 =
1522 	 * ire->ire_ipif->ipif_src_addr_v6.
1523 	 */
1524 	if (match_flags & (MATCH_IRE_ILL|MATCH_IRE_ILL_GROUP)) {
1525 		ire_ill = ire_to_ill(ire);
1526 		if (ire_ill != NULL)
1527 			ire_ill_group = ire_ill->ill_group;
1528 		ipif_ill = ipif->ipif_ill;
1529 		ipif_ill_group = ipif_ill->ill_group;
1530 	}
1531 
1532 	/* No ire_addr_v6 bits set past the mask */
1533 	ASSERT(V6_MASK_EQ(ire->ire_addr_v6, ire->ire_mask_v6,
1534 	    ire->ire_addr_v6));
1535 	V6_MASK_COPY(*addr, *mask, masked_addr);
1536 
1537 	if (V6_MASK_EQ(*addr, *mask, ire->ire_addr_v6) &&
1538 	    ((!(match_flags & MATCH_IRE_GW)) ||
1539 		IN6_ARE_ADDR_EQUAL(&gw_addr_v6, gateway)) &&
1540 	    ((!(match_flags & MATCH_IRE_TYPE)) ||
1541 		(ire->ire_type & type)) &&
1542 	    ((!(match_flags & MATCH_IRE_SRC)) ||
1543 		IN6_ARE_ADDR_EQUAL(&ire->ire_src_addr_v6,
1544 		&ipif->ipif_v6src_addr)) &&
1545 	    ((!(match_flags & MATCH_IRE_IPIF)) ||
1546 		(ire->ire_ipif == ipif)) &&
1547 	    ((!(match_flags & MATCH_IRE_MARK_HIDDEN)) ||
1548 		(ire->ire_type != IRE_CACHE ||
1549 		ire->ire_marks & IRE_MARK_HIDDEN)) &&
1550 	    ((!(match_flags & MATCH_IRE_ILL)) ||
1551 		(ire_ill == ipif_ill)) &&
1552 	    ((!(match_flags & MATCH_IRE_IHANDLE)) ||
1553 		(ire->ire_ihandle == ihandle)) &&
1554 	    ((!(match_flags & MATCH_IRE_ILL_GROUP)) ||
1555 		(ire_ill == ipif_ill) ||
1556 		(ire_ill_group != NULL &&
1557 		ire_ill_group == ipif_ill_group)) &&
1558 	    ((!(match_flags & MATCH_IRE_SECATTR)) ||
1559 		(!is_system_labeled()) ||
1560 		(tsol_ire_match_gwattr(ire, tsl) == 0))) {
1561 		/* We found the matched IRE */
1562 		return (B_TRUE);
1563 	}
1564 	return (B_FALSE);
1565 }
1566 
1567 /*
1568  * Lookup for a route in all the tables
1569  */
1570 ire_t *
1571 ire_route_lookup_v6(const in6_addr_t *addr, const in6_addr_t *mask,
1572     const in6_addr_t *gateway, int type, const ipif_t *ipif, ire_t **pire,
1573     zoneid_t zoneid, const ts_label_t *tsl, int flags)
1574 {
1575 	ire_t *ire = NULL;
1576 
1577 	/*
1578 	 * ire_match_args_v6() will dereference ipif MATCH_IRE_SRC or
1579 	 * MATCH_IRE_ILL is set.
1580 	 */
1581 	if ((flags & (MATCH_IRE_SRC | MATCH_IRE_ILL | MATCH_IRE_ILL_GROUP)) &&
1582 	    (ipif == NULL))
1583 		return (NULL);
1584 
1585 	/*
1586 	 * might be asking for a cache lookup,
1587 	 * This is not best way to lookup cache,
1588 	 * user should call ire_cache_lookup directly.
1589 	 *
1590 	 * If MATCH_IRE_TYPE was set, first lookup in the cache table and then
1591 	 * in the forwarding table, if the applicable type flags were set.
1592 	 */
1593 	if ((flags & MATCH_IRE_TYPE) == 0 || (type & IRE_CACHETABLE) != 0) {
1594 		ire = ire_ctable_lookup_v6(addr, gateway, type, ipif, zoneid,
1595 		    tsl, flags);
1596 		if (ire != NULL)
1597 			return (ire);
1598 	}
1599 	if ((flags & MATCH_IRE_TYPE) == 0 || (type & IRE_FORWARDTABLE) != 0) {
1600 		ire = ire_ftable_lookup_v6(addr, mask, gateway, type, ipif,
1601 		    pire, zoneid, 0, tsl, flags);
1602 	}
1603 	return (ire);
1604 }
1605 
1606 /*
1607  * Lookup a route in forwarding table.
1608  * specific lookup is indicated by passing the
1609  * required parameters and indicating the
1610  * match required in flag field.
1611  *
1612  * Looking for default route can be done in three ways
1613  * 1) pass mask as ipv6_all_zeros and set MATCH_IRE_MASK in flags field
1614  *    along with other matches.
1615  * 2) pass type as IRE_DEFAULT and set MATCH_IRE_TYPE in flags
1616  *    field along with other matches.
1617  * 3) if the destination and mask are passed as zeros.
1618  *
1619  * A request to return a default route if no route
1620  * is found, can be specified by setting MATCH_IRE_DEFAULT
1621  * in flags.
1622  *
1623  * It does not support recursion more than one level. It
1624  * will do recursive lookup only when the lookup maps to
1625  * a prefix or default route and MATCH_IRE_RECURSIVE flag is passed.
1626  *
1627  * If the routing table is setup to allow more than one level
1628  * of recursion, the cleaning up cache table will not work resulting
1629  * in invalid routing.
1630  *
1631  * Supports link-local addresses by following the ipif/ill when recursing.
1632  *
1633  * NOTE : When this function returns NULL, pire has already been released.
1634  *	  pire is valid only when this function successfully returns an
1635  *	  ire.
1636  */
1637 ire_t *
1638 ire_ftable_lookup_v6(const in6_addr_t *addr, const in6_addr_t *mask,
1639     const in6_addr_t *gateway, int type, const ipif_t *ipif, ire_t **pire,
1640     zoneid_t zoneid, uint32_t ihandle, const ts_label_t *tsl, int flags)
1641 {
1642 	irb_t *irb_ptr;
1643 	ire_t	*rire;
1644 	ire_t *ire = NULL;
1645 	ire_t	*saved_ire;
1646 	nce_t	*nce;
1647 	int i;
1648 	in6_addr_t gw_addr_v6;
1649 
1650 	ASSERT(addr != NULL);
1651 	ASSERT((!(flags & MATCH_IRE_MASK)) || mask != NULL);
1652 	ASSERT((!(flags & MATCH_IRE_GW)) || gateway != NULL);
1653 	ASSERT(ipif == NULL || ipif->ipif_isv6);
1654 	ASSERT(!(flags & MATCH_IRE_WQ));
1655 
1656 	/*
1657 	 * When we return NULL from this function, we should make
1658 	 * sure that *pire is NULL so that the callers will not
1659 	 * wrongly REFRELE the pire.
1660 	 */
1661 	if (pire != NULL)
1662 		*pire = NULL;
1663 	/*
1664 	 * ire_match_args_v6() will dereference ipif MATCH_IRE_SRC or
1665 	 * ire_match_args_v6() will dereference ipif if MATCH_IRE_SRC,
1666 	 * MATCH_IRE_ILL or MATCH_IRE_ILL_GROUP is set.
1667 	if ((flags & (MATCH_IRE_SRC | MATCH_IRE_ILL | MATCH_IRE_ILL_GROUP)) &&
1668 	    (ipif == NULL))
1669 		return (NULL);
1670 
1671 	/*
1672 	 * If the mask is known, the lookup
1673 	 * is simple, if the mask is not known
1674 	 * we need to search.
1675 	 */
1676 	if (flags & MATCH_IRE_MASK) {
1677 		uint_t masklen;
1678 
1679 		masklen = ip_mask_to_plen_v6(mask);
1680 		if (ip_forwarding_table_v6[masklen] == NULL)
1681 			return (NULL);
1682 		irb_ptr = &(ip_forwarding_table_v6[masklen][
1683 		    IRE_ADDR_MASK_HASH_V6(*addr, *mask, ip6_ftable_hash_size)]);
1684 		rw_enter(&irb_ptr->irb_lock, RW_READER);
1685 		for (ire = irb_ptr->irb_ire; ire != NULL;
1686 		    ire = ire->ire_next) {
1687 			if (ire->ire_marks & IRE_MARK_CONDEMNED)
1688 				continue;
1689 			if (ire_match_args_v6(ire, addr, mask, gateway, type,
1690 			    ipif, zoneid, ihandle, tsl, flags))
1691 				goto found_ire;
1692 		}
1693 		rw_exit(&irb_ptr->irb_lock);
1694 	} else {
1695 		/*
1696 		 * In this case we don't know the mask, we need to
1697 		 * search the table assuming different mask sizes.
1698 		 * we start with 128 bit mask, we don't allow default here.
1699 		 */
1700 		for (i = (IP6_MASK_TABLE_SIZE - 1); i > 0; i--) {
1701 			in6_addr_t tmpmask;
1702 
1703 			if ((ip_forwarding_table_v6[i]) == NULL)
1704 				continue;
1705 			(void) ip_plen_to_mask_v6(i, &tmpmask);
1706 			irb_ptr = &ip_forwarding_table_v6[i][
1707 			    IRE_ADDR_MASK_HASH_V6(*addr, tmpmask,
1708 			    ip6_ftable_hash_size)];
1709 			rw_enter(&irb_ptr->irb_lock, RW_READER);
1710 			for (ire = irb_ptr->irb_ire; ire != NULL;
1711 			    ire = ire->ire_next) {
1712 				if (ire->ire_marks & IRE_MARK_CONDEMNED)
1713 					continue;
1714 				if (ire_match_args_v6(ire, addr,
1715 				    &ire->ire_mask_v6, gateway, type, ipif,
1716 				    zoneid, ihandle, tsl, flags))
1717 					goto found_ire;
1718 			}
1719 			rw_exit(&irb_ptr->irb_lock);
1720 		}
1721 	}
1722 
1723 	/*
1724 	 * We come here if no route has yet been found.
1725 	 *
1726 	 * Handle the case where default route is
1727 	 * requested by specifying type as one of the possible
1728 	 * types for that can have a zero mask (IRE_DEFAULT and IRE_INTERFACE).
1729 	 * types that can have a zero mask (IRE_DEFAULT and IRE_INTERFACE).
1730 	 * If MATCH_IRE_MASK is specified, then the appropriate default route
1731 	 * would have been found above if it exists so it isn't looked up here.
1732 	 * If MATCH_IRE_DEFAULT was also specified, then a default route will be
1733 	 * searched for later.
1734 	 */
1735 	if ((flags & (MATCH_IRE_TYPE | MATCH_IRE_MASK)) == MATCH_IRE_TYPE &&
1736 	    (type & (IRE_DEFAULT | IRE_INTERFACE))) {
1737 		if (ip_forwarding_table_v6[0] != NULL) {
1738 			/* addr & mask is zero for defaults */
1739 			irb_ptr = &ip_forwarding_table_v6[0][
1740 			    IRE_ADDR_HASH_V6(ipv6_all_zeros,
1741 			    ip6_ftable_hash_size)];
1742 			rw_enter(&irb_ptr->irb_lock, RW_READER);
1743 			for (ire = irb_ptr->irb_ire; ire != NULL;
1744 			    ire = ire->ire_next) {
1745 
1746 				if (ire->ire_marks & IRE_MARK_CONDEMNED)
1747 					continue;
1748 
1749 				if (ire_match_args_v6(ire, addr,
1750 				    &ipv6_all_zeros, gateway, type, ipif,
1751 				    zoneid, ihandle, tsl, flags))
1752 					goto found_ire;
1753 			}
1754 			rw_exit(&irb_ptr->irb_lock);
1755 		}
1756 	}
1757 	/*
1758 	 * We come here only if no route is found.
1759 	 * see if the default route can be used which is allowed
1760 	 * only if the default matching criteria is specified.
1761 	 * The ipv6_ire_default_count tracks the number of IRE_DEFAULT
1762 	 * entries. However, the ip_forwarding_table_v6[0] also contains
1763 	 * interface routes thus the count can be zero.
1764 	 */
1765 	saved_ire = NULL;
1766 	if ((flags & (MATCH_IRE_DEFAULT | MATCH_IRE_MASK)) ==
1767 	    MATCH_IRE_DEFAULT) {
1768 		ire_t	*ire_origin;
1769 		uint_t	g_index;
1770 		uint_t	index;
1771 
1772 		if (ip_forwarding_table_v6[0] == NULL)
1773 			return (NULL);
1774 		irb_ptr = &(ip_forwarding_table_v6[0])[0];
1775 
1776 		/*
1777 		 * Keep a tab on the bucket while looking the IRE_DEFAULT
1778 		 * entries. We need to keep track of a particular IRE
1779 		 * (ire_origin) so this ensures that it will not be unlinked
1780 		 * from the hash list during the recursive lookup below.
1781 		 */
1782 		IRB_REFHOLD(irb_ptr);
1783 		ire = irb_ptr->irb_ire;
1784 		if (ire == NULL) {
1785 			IRB_REFRELE(irb_ptr);
1786 			return (NULL);
1787 		}
1788 
1789 		/*
1790 		 * Get the index first, since it can be changed by other
1791 		 * threads. Then get to the right default route skipping
1792 		 * default interface routes if any. As we hold a reference on
1793 		 * the IRE bucket, ipv6_ire_default_count can only increase so
1794 		 * we can't reach the end of the hash list unexpectedly.
1795 		 */
1796 		if (ipv6_ire_default_count != 0) {
1797 			g_index = ipv6_ire_default_index++;
1798 			index = g_index % ipv6_ire_default_count;
1799 			while (index != 0) {
1800 				if (!(ire->ire_type & IRE_INTERFACE))
1801 					index--;
1802 				ire = ire->ire_next;
1803 			}
1804 			ASSERT(ire != NULL);
1805 		} else {
1806 			/*
1807 			 * No default route, so we only have default interface
1808 			 * routes: don't enter the first loop.
1809 			 */
1810 			ire = NULL;
1811 		}
1812 
1813 		/*
1814 		 * Round-robin the default routers list looking for a neighbor
1815 		 * that matches the passed in parameters and is reachable.  If
1816 		 * none found, just return a route from the default router list
1817 		 * if it exists. If we can't find a default route (IRE_DEFAULT),
1818 		 * look for interface default routes.
1819 		 * We start with the ire we found above and we walk the hash
1820 		 * list until we're back where we started, see
1821 		 * ire_get_next_default_ire(). It doesn't matter if default
1822 		 * routes are added or deleted by other threads - we know this
1823 		 * ire will stay in the list because we hold a reference on the
1824 		 * ire bucket.
1825 		 * NB: if we only have interface default routes, ire is NULL so
1826 		 * we don't even enter this loop (see above).
1827 		 */
1828 		ire_origin = ire;
1829 		for (; ire != NULL;
1830 		    ire = ire_get_next_default_ire(ire, ire_origin)) {
1831 
1832 			if (ire_match_args_v6(ire, addr,
1833 			    &ipv6_all_zeros, gateway, type, ipif,
1834 			    zoneid, ihandle, tsl, flags)) {
1835 				int match_flags;
1836 
1837 				/*
1838 				 * We have something to work with.
1839 				 * If we can find a resolved/reachable
1840 				 * entry, we will use this. Otherwise
1841 				 * we'll try to find an entry that has
1842 				 * a resolved cache entry. We will fallback
1843 				 * on this if we don't find anything else.
1844 				 */
1845 				if (saved_ire == NULL)
1846 					saved_ire = ire;
1847 				mutex_enter(&ire->ire_lock);
1848 				gw_addr_v6 = ire->ire_gateway_addr_v6;
1849 				mutex_exit(&ire->ire_lock);
1850 				match_flags = MATCH_IRE_ILL_GROUP |
1851 				    MATCH_IRE_SECATTR;
1852 				rire = ire_ctable_lookup_v6(&gw_addr_v6, NULL,
1853 				    0, ire->ire_ipif, zoneid, tsl, match_flags);
1854 				if (rire != NULL) {
1855 					nce = rire->ire_nce;
1856 					if (nce != NULL &&
1857 					    NCE_ISREACHABLE(nce) &&
1858 					    nce->nce_flags & NCE_F_ISROUTER) {
1859 						ire_refrele(rire);
1860 						IRE_REFHOLD(ire);
1861 						IRB_REFRELE(irb_ptr);
1862 						goto found_ire_held;
1863 					} else if (nce != NULL &&
1864 					    !(nce->nce_flags &
1865 					    NCE_F_ISROUTER)) {
1866 						/*
1867 						 * Make sure we don't use
1868 						 * this ire
1869 						 */
1870 						if (saved_ire == ire)
1871 							saved_ire = NULL;
1872 					}
1873 					ire_refrele(rire);
1874 				} else if (ipv6_ire_default_count > 1 &&
1875 				    zoneid != ALL_ZONES) {
1876 					/*
1877 					 * When we're in a local zone, we're
1878 					 * only interested in default routers
1879 					 * that are reachable through ipifs
1880 					 * within our zone.
1881 					 * The potentially expensive call to
1882 					 * ire_route_lookup_v6() is avoided when
1883 					 * we have only one default route.
1884 					 */
1885 					match_flags |= MATCH_IRE_TYPE;
1886 					rire = ire_route_lookup_v6(&gw_addr_v6,
1887 					    NULL, NULL, IRE_INTERFACE,
1888 					    ire->ire_ipif, NULL,
1889 					    zoneid, tsl, match_flags);
1890 					if (rire != NULL) {
1891 						ire_refrele(rire);
1892 						saved_ire = ire;
1893 					} else if (saved_ire == ire) {
1894 						/*
1895 						 * Make sure we don't use
1896 						 * this ire
1897 						 */
1898 						saved_ire = NULL;
1899 					}
1900 				}
1901 			}
1902 		}
1903 		if (saved_ire != NULL) {
1904 			ire = saved_ire;
1905 			IRE_REFHOLD(ire);
1906 			IRB_REFRELE(irb_ptr);
1907 			goto found_ire_held;
1908 		} else {
1909 			/*
1910 			 * Look for a interface default route matching the
1911 			 * args passed in. No round robin here. Just pick
1912 			 * the right one.
1913 			 */
1914 			for (ire = irb_ptr->irb_ire; ire != NULL;
1915 			    ire = ire->ire_next) {
1916 
1917 				if (!(ire->ire_type & IRE_INTERFACE))
1918 					continue;
1919 
1920 				if (ire->ire_marks & IRE_MARK_CONDEMNED)
1921 					continue;
1922 
1923 				if (ire_match_args_v6(ire, addr,
1924 				    &ipv6_all_zeros, gateway, type, ipif,
1925 				    zoneid, ihandle, tsl, flags)) {
1926 					IRE_REFHOLD(ire);
1927 					IRB_REFRELE(irb_ptr);
1928 					goto found_ire_held;
1929 				}
1930 			}
1931 			IRB_REFRELE(irb_ptr);
1932 		}
1933 	}
1934 	ASSERT(ire == NULL);
1935 	ip1dbg(("ire_ftable_lookup_v6: returning NULL ire"));
1936 	return (NULL);
1937 found_ire:
1938 	ASSERT((ire->ire_marks & IRE_MARK_CONDEMNED) == 0);
1939 	IRE_REFHOLD(ire);
1940 	rw_exit(&irb_ptr->irb_lock);
1941 
1942 found_ire_held:
1943 	if ((flags & MATCH_IRE_RJ_BHOLE) &&
1944 	    (ire->ire_flags & (RTF_BLACKHOLE | RTF_REJECT))) {
1945 		return (ire);
1946 	}
1947 	/*
1948 	 * At this point, IRE that was found must be an IRE_FORWARDTABLE
1949 	 * or IRE_CACHETABLE type.  If this is a recursive lookup and an
1950 	 * IRE_INTERFACE type was found, return that.  If it was some other
1951 	 * IRE_FORWARDTABLE type of IRE (one of the prefix types), then it
1952 	 * is necessary to fill in the  parent IRE pointed to by pire, and
1953 	 * then lookup the gateway address of  the parent.  For backwards
1954 	 * compatiblity, if this lookup returns an
1955 	 * IRE other than a IRE_CACHETABLE or IRE_INTERFACE, then one more level
1956 	 * of lookup is done.
1957 	 */
1958 	if (flags & MATCH_IRE_RECURSIVE) {
1959 		const ipif_t *gw_ipif;
1960 		int match_flags = MATCH_IRE_DSTONLY;
1961 
1962 		if (ire->ire_type & IRE_INTERFACE)
1963 			return (ire);
1964 		if (pire != NULL)
1965 			*pire = ire;
1966 		/*
1967 		 * If we can't find an IRE_INTERFACE or the caller has not
1968 		 * asked for pire, we need to REFRELE the saved_ire.
1969 		 */
1970 		saved_ire = ire;
1971 
1972 		/*
1973 		 * Currently MATCH_IRE_ILL is never used with
1974 		 * (MATCH_IRE_RECURSIVE | MATCH_IRE_DEFAULT) while
1975 		 * sending out packets as MATCH_IRE_ILL is used only
1976 		 * for communicating with on-link hosts. We can't assert
1977 		 * that here as RTM_GET calls this function with
1978 		 * MATCH_IRE_ILL | MATCH_IRE_DEFAULT | MATCH_IRE_RECURSIVE.
1979 		 * We have already used the MATCH_IRE_ILL in determining
1980 		 * the right prefix route at this point. To match the
1981 		 * behavior of how we locate routes while sending out
1982 		 * packets, we don't want to use MATCH_IRE_ILL below
1983 		 * while locating the interface route.
1984 		 */
1985 		if (ire->ire_ipif != NULL)
1986 			match_flags |= MATCH_IRE_ILL_GROUP;
1987 
1988 		mutex_enter(&ire->ire_lock);
1989 		gw_addr_v6 = ire->ire_gateway_addr_v6;
1990 		mutex_exit(&ire->ire_lock);
1991 
1992 		ire = ire_route_lookup_v6(&gw_addr_v6, NULL, NULL, 0,
1993 		    ire->ire_ipif, NULL, zoneid, tsl, match_flags);
1994 		if (ire == NULL) {
1995 			/*
1996 			 * In this case we have to deal with the
1997 			 * MATCH_IRE_PARENT flag, which means the
1998 			 * parent has to be returned if ire is NULL.
1999 			 * The aim of this is to have (at least) a starting
2000 			 * ire when we want to look at all of the ires in a
2001 			 * bucket aimed at a single destination (as is the
2002 			 * case in ip_newroute_v6 for the RTF_MULTIRT
2003 			 * flagged routes).
2004 			 */
2005 			if (flags & MATCH_IRE_PARENT) {
2006 				if (pire != NULL) {
2007 					/*
2008 					 * Need an extra REFHOLD, if the
2009 					 * parent ire is returned via both
2010 					 * ire and pire.
2011 					 */
2012 					IRE_REFHOLD(saved_ire);
2013 				}
2014 				ire = saved_ire;
2015 			} else {
2016 				ire_refrele(saved_ire);
2017 				if (pire != NULL)
2018 					*pire = NULL;
2019 			}
2020 			return (ire);
2021 		}
2022 		if (ire->ire_type & (IRE_CACHETABLE | IRE_INTERFACE)) {
2023 			/*
2024 			 * If the caller did not ask for pire, release
2025 			 * it now.
2026 			 */
2027 			if (pire == NULL) {
2028 				ire_refrele(saved_ire);
2029 			}
2030 			return (ire);
2031 		}
2032 		match_flags |= MATCH_IRE_TYPE;
2033 		mutex_enter(&ire->ire_lock);
2034 		gw_addr_v6 = ire->ire_gateway_addr_v6;
2035 		mutex_exit(&ire->ire_lock);
2036 		gw_ipif = ire->ire_ipif;
2037 		ire_refrele(ire);
2038 		ire = ire_route_lookup_v6(&gw_addr_v6, NULL, NULL,
2039 		    (IRE_CACHETABLE | IRE_INTERFACE), gw_ipif, NULL, zoneid,
2040 		    NULL, match_flags);
2041 		if (ire == NULL) {
2042 			/*
2043 			 * In this case we have to deal with the
2044 			 * MATCH_IRE_PARENT flag, which means the
2045 			 * parent has to be returned if ire is NULL.
2046 			 * The aim of this is to have (at least) a starting
2047 			 * ire when we want to look at all of the ires in a
2048 			 * bucket aimed at a single destination (as is the
2049 			 * case in ip_newroute_v6 for the RTF_MULTIRT
2050 			 * flagged routes).
2051 			 */
2052 			if (flags & MATCH_IRE_PARENT) {
2053 				if (pire != NULL) {
2054 					/*
2055 					 * Need an extra REFHOLD, if the
2056 					 * parent ire is returned via both
2057 					 * ire and pire.
2058 					 */
2059 					IRE_REFHOLD(saved_ire);
2060 				}
2061 				ire = saved_ire;
2062 			} else {
2063 				ire_refrele(saved_ire);
2064 				if (pire != NULL)
2065 					*pire = NULL;
2066 			}
2067 			return (ire);
2068 		} else if (pire == NULL) {
2069 			/*
2070 			 * If the caller did not ask for pire, release
2071 			 * it now.
2072 			 */
2073 			ire_refrele(saved_ire);
2074 		}
2075 		return (ire);
2076 	}
2077 
2078 	ASSERT(pire == NULL || *pire == NULL);
2079 	return (ire);
2080 }
2081 
2082 /*
2083  * Delete the IRE cache for the gateway and all IRE caches whose
2084  * ire_gateway_addr_v6 points to this gateway, and allow them to
2085  * be created on demand by ip_newroute_v6.
2086  */
2087 void
2088 ire_clookup_delete_cache_gw_v6(const in6_addr_t *addr, zoneid_t zoneid)
2089 {
2090 	irb_t *irb;
2091 	ire_t *ire;
2092 
2093 	irb = &ip_cache_table_v6[IRE_ADDR_HASH_V6(*addr, ip6_cache_table_size)];
2094 	IRB_REFHOLD(irb);
2095 	for (ire = irb->irb_ire; ire != NULL; ire = ire->ire_next) {
2096 		if (ire->ire_marks & IRE_MARK_CONDEMNED)
2097 			continue;
2098 
2099 		ASSERT(IN6_ARE_ADDR_EQUAL(&ire->ire_mask_v6, &ipv6_all_ones));
2100 		if (ire_match_args_v6(ire, addr, &ire->ire_mask_v6, 0,
2101 		    IRE_CACHE, NULL, zoneid, 0, NULL, MATCH_IRE_TYPE)) {
2102 			ire_delete(ire);
2103 		}
2104 	}
2105 	IRB_REFRELE(irb);
2106 
2107 	ire_walk_v6(ire_delete_cache_gw_v6, (char *)addr, zoneid);
2108 }
2109 
2110 /*
2111  * Looks up cache table for a route.
2112  * specific lookup can be indicated by
2113  * passing the MATCH_* flags and the
2114  * necessary parameters.
2115  */
2116 ire_t *
2117 ire_ctable_lookup_v6(const in6_addr_t *addr, const in6_addr_t *gateway,
2118     int type, const ipif_t *ipif, zoneid_t zoneid, const ts_label_t *tsl,
2119     int flags)
2120 {
2121 	ire_t *ire;
2122 	irb_t *irb_ptr;
2123 	ASSERT(addr != NULL);
2124 	ASSERT((!(flags & MATCH_IRE_GW)) || gateway != NULL);
2125 
2126 	/*
2127 	 * ire_match_args_v6() will dereference ipif MATCH_IRE_SRC or
2128 	 * MATCH_IRE_ILL is set.
2129 	 */
2130 	if ((flags & (MATCH_IRE_SRC |  MATCH_IRE_ILL | MATCH_IRE_ILL_GROUP)) &&
2131 	    (ipif == NULL))
2132 		return (NULL);
2133 
2134 	irb_ptr = &ip_cache_table_v6[IRE_ADDR_HASH_V6(*addr,
2135 	    ip6_cache_table_size)];
2136 	rw_enter(&irb_ptr->irb_lock, RW_READER);
2137 	for (ire = irb_ptr->irb_ire; ire; ire = ire->ire_next) {
2138 		if (ire->ire_marks & IRE_MARK_CONDEMNED)
2139 			continue;
2140 
2141 		ASSERT(IN6_ARE_ADDR_EQUAL(&ire->ire_mask_v6, &ipv6_all_ones));
2142 		if (ire_match_args_v6(ire, addr, &ire->ire_mask_v6, gateway,
2143 		    type, ipif, zoneid, 0, tsl, flags)) {
2144 			IRE_REFHOLD(ire);
2145 			rw_exit(&irb_ptr->irb_lock);
2146 			return (ire);
2147 		}
2148 	}
2149 	rw_exit(&irb_ptr->irb_lock);
2150 	return (NULL);
2151 }
2152 
2153 /*
2154  * Lookup cache. Don't return IRE_MARK_HIDDEN entries. Callers
2155  * should use ire_ctable_lookup with MATCH_IRE_MARK_HIDDEN to get
2156  * to the hidden ones.
2157  *
2158  * In general the zoneid has to match (where ALL_ZONES match all of them).
2159  * But for IRE_LOCAL we also need to handle the case where L2 should
2160  * conceptually loop back the packet. This is necessary since neither
2161  * Ethernet drivers nor Ethernet hardware loops back packets sent to their
2162  * own MAC address. This loopback is needed when the normal
2163  * routes (ignoring IREs with different zoneids) would send out the packet on
2164  * the same ill (or ill group) as the ill with which this IRE_LOCAL is
2165  * associated.
2166  *
2167  * Earlier versions of this code always matched an IRE_LOCAL independently of
2168  * the zoneid. We preserve that earlier behavior when
2169  * ip_restrict_interzone_loopback is turned off.
2170  */
2171 ire_t *
2172 ire_cache_lookup_v6(const in6_addr_t *addr, zoneid_t zoneid,
2173     const ts_label_t *tsl)
2174 {
2175 	irb_t *irb_ptr;
2176 	ire_t *ire;
2177 
2178 	irb_ptr = &ip_cache_table_v6[IRE_ADDR_HASH_V6(*addr,
2179 	    ip6_cache_table_size)];
2180 	rw_enter(&irb_ptr->irb_lock, RW_READER);
2181 	for (ire = irb_ptr->irb_ire; ire; ire = ire->ire_next) {
2182 		if (ire->ire_marks & (IRE_MARK_CONDEMNED|IRE_MARK_HIDDEN))
2183 			continue;
2184 		if (IN6_ARE_ADDR_EQUAL(&ire->ire_addr_v6, addr)) {
2185 			/*
2186 			 * Finally, check if the security policy has any
2187 			 * restriction on using this route for the specified
2188 			 * message.
2189 			 */
2190 			if (tsl != NULL &&
2191 			    ire->ire_gw_secattr != NULL &&
2192 			    tsol_ire_match_gwattr(ire, tsl) != 0) {
2193 				continue;
2194 			}
2195 
2196 			if (zoneid == ALL_ZONES || ire->ire_zoneid == zoneid ||
2197 			    ire->ire_zoneid == ALL_ZONES) {
2198 				IRE_REFHOLD(ire);
2199 				rw_exit(&irb_ptr->irb_lock);
2200 				return (ire);
2201 			}
2202 
2203 			if (ire->ire_type == IRE_LOCAL) {
2204 				if (ip_restrict_interzone_loopback &&
2205 				    !ire_local_ok_across_zones(ire, zoneid,
2206 				    (void *)addr, tsl))
2207 					continue;
2208 
2209 				IRE_REFHOLD(ire);
2210 				rw_exit(&irb_ptr->irb_lock);
2211 				return (ire);
2212 			}
2213 		}
2214 	}
2215 	rw_exit(&irb_ptr->irb_lock);
2216 	return (NULL);
2217 }
2218 
2219 /*
2220  * Locate the interface ire that is tied to the cache ire 'cire' via
2221  * cire->ire_ihandle.
2222  *
2223  * We are trying to create the cache ire for an onlink destn. or
2224  * gateway in 'cire'. We are called from ire_add_v6() in the IRE_IF_RESOLVER
2225  * case for xresolv interfaces, after the ire has come back from
2226  * an external resolver.
2227  */
2228 static ire_t *
2229 ire_ihandle_lookup_onlink_v6(ire_t *cire)
2230 {
2231 	ire_t	*ire;
2232 	int	match_flags;
2233 	int	i;
2234 	int	j;
2235 	irb_t	*irb_ptr;
2236 
2237 	ASSERT(cire != NULL);
2238 
2239 	match_flags =  MATCH_IRE_TYPE | MATCH_IRE_IHANDLE | MATCH_IRE_MASK;
2240 	/*
2241 	 * We know that the mask of the interface ire equals cire->ire_cmask.
2242 	 * (When ip_newroute_v6() created 'cire' for an on-link destn.
2243 	 * it set its cmask from the interface ire's mask)
2244 	 */
2245 	ire = ire_ftable_lookup_v6(&cire->ire_addr_v6, &cire->ire_cmask_v6,
2246 	    NULL, IRE_INTERFACE, NULL, NULL, ALL_ZONES, cire->ire_ihandle,
2247 	    NULL, match_flags);
2248 	if (ire != NULL)
2249 		return (ire);
2250 	/*
2251 	 * If we didn't find an interface ire above, we can't declare failure.
2252 	 * For backwards compatibility, we need to support prefix routes
2253 	 * pointing to next hop gateways that are not on-link.
2254 	 *
2255 	 * In the resolver/noresolver case, ip_newroute_v6() thinks
2256 	 * it is creating the cache ire for an onlink destination in 'cire'.
2257 	 * But 'cire' is not actually onlink, because ire_ftable_lookup_v6()
2258 	 * cheated it, by doing ire_route_lookup_v6() twice and returning an
2259 	 * interface ire.
2260 	 *
2261 	 * Eg. default	-	gw1			(line 1)
2262 	 *	gw1	-	gw2			(line 2)
2263 	 *	gw2	-	hme0			(line 3)
2264 	 *
2265 	 * In the above example, ip_newroute_v6() tried to create the cache ire
2266 	 * 'cire' for gw1, based on the interface route in line 3. The
2267 	 * ire_ftable_lookup_v6() above fails, because there is
2268 	 * no interface route to reach gw1. (it is gw2). We fall thru below.
2269 	 *
2270 	 * Do a brute force search based on the ihandle in a subset of the
2271 	 * forwarding tables, corresponding to cire->ire_cmask_v6. Otherwise
2272 	 * things become very complex, since we don't have 'pire' in this
2273 	 * case. (Also note that this method is not possible in the offlink
2274 	 * case because we don't know the mask)
2275 	 */
2276 	i = ip_mask_to_plen_v6(&cire->ire_cmask_v6);
2277 	if ((ip_forwarding_table_v6[i]) == NULL)
2278 		return (NULL);
2279 	for (j = 0; j < ip6_ftable_hash_size; j++) {
2280 		irb_ptr = &ip_forwarding_table_v6[i][j];
2281 		rw_enter(&irb_ptr->irb_lock, RW_READER);
2282 		for (ire = irb_ptr->irb_ire; ire != NULL;
2283 		    ire = ire->ire_next) {
2284 			if (ire->ire_marks & IRE_MARK_CONDEMNED)
2285 				continue;
2286 			if ((ire->ire_type & IRE_INTERFACE) &&
2287 			    (ire->ire_ihandle == cire->ire_ihandle)) {
2288 				IRE_REFHOLD(ire);
2289 				rw_exit(&irb_ptr->irb_lock);
2290 				return (ire);
2291 			}
2292 		}
2293 		rw_exit(&irb_ptr->irb_lock);
2294 	}
2295 	return (NULL);
2296 }
2297 
2298 
2299 /*
2300  * Locate the interface ire that is tied to the cache ire 'cire' via
2301  * cire->ire_ihandle.
2302  *
2303  * We are trying to create the cache ire for an offlink destn based
2304  * on the cache ire of the gateway in 'cire'. 'pire' is the prefix ire
2305  * as found by ip_newroute_v6(). We are called from ip_newroute_v6() in
2306  * the IRE_CACHE case.
2307  */
2308 ire_t *
2309 ire_ihandle_lookup_offlink_v6(ire_t *cire, ire_t *pire)
2310 {
2311 	ire_t	*ire;
2312 	int	match_flags;
2313 	in6_addr_t	gw_addr;
2314 	ipif_t		*gw_ipif;
2315 
2316 	ASSERT(cire != NULL && pire != NULL);
2317 
2318 	match_flags =  MATCH_IRE_TYPE | MATCH_IRE_IHANDLE | MATCH_IRE_MASK;
2319 	/*
2320 	 * ip_newroute_v6 calls ire_ftable_lookup with MATCH_IRE_ILL only
2321 	 * for on-link hosts. We should never be here for onlink.
2322 	 * Thus, use MATCH_IRE_ILL_GROUP.
2323 	 */
2324 	if (pire->ire_ipif != NULL)
2325 		match_flags |= MATCH_IRE_ILL_GROUP;
2326 	/*
2327 	 * We know that the mask of the interface ire equals cire->ire_cmask.
2328 	 * (When ip_newroute_v6() created 'cire' for an on-link destn. it set
2329 	 * its cmask from the interface ire's mask)
2330 	 */
2331 	ire = ire_ftable_lookup_v6(&cire->ire_addr_v6, &cire->ire_cmask_v6, 0,
2332 	    IRE_INTERFACE, pire->ire_ipif, NULL, ALL_ZONES, cire->ire_ihandle,
2333 	    NULL, match_flags);
2334 	if (ire != NULL)
2335 		return (ire);
2336 	/*
2337 	 * If we didn't find an interface ire above, we can't declare failure.
2338 	 * For backwards compatibility, we need to support prefix routes
2339 	 * pointing to next hop gateways that are not on-link.
2340 	 *
2341 	 * Assume we are trying to ping some offlink destn, and we have the
2342 	 * routing table below.
2343 	 *
2344 	 * Eg.	default	- gw1		<--- pire	(line 1)
2345 	 *	gw1	- gw2				(line 2)
2346 	 *	gw2	- hme0				(line 3)
2347 	 *
2348 	 * If we already have a cache ire for gw1 in 'cire', the
2349 	 * ire_ftable_lookup_v6 above would have failed, since there is no
2350 	 * interface ire to reach gw1. We will fallthru below.
2351 	 *
2352 	 * Here we duplicate the steps that ire_ftable_lookup_v6() did in
2353 	 * getting 'cire' from 'pire', in the MATCH_IRE_RECURSIVE case.
2354 	 * The differences are the following
2355 	 * i.   We want the interface ire only, so we call
2356 	 *	ire_ftable_lookup_v6() instead of ire_route_lookup_v6()
2357 	 * ii.  We look for only prefix routes in the 1st call below.
2358 	 * ii.  We want to match on the ihandle in the 2nd call below.
2359 	 */
2360 	match_flags =  MATCH_IRE_TYPE;
2361 	if (pire->ire_ipif != NULL)
2362 		match_flags |= MATCH_IRE_ILL_GROUP;
2363 
2364 	mutex_enter(&pire->ire_lock);
2365 	gw_addr = pire->ire_gateway_addr_v6;
2366 	mutex_exit(&pire->ire_lock);
2367 	ire = ire_ftable_lookup_v6(&gw_addr, 0, 0, IRE_OFFSUBNET,
2368 	    pire->ire_ipif, NULL, ALL_ZONES, 0, NULL, match_flags);
2369 	if (ire == NULL)
2370 		return (NULL);
2371 	/*
2372 	 * At this point 'ire' corresponds to the entry shown in line 2.
2373 	 * gw_addr is 'gw2' in the example above.
2374 	 */
2375 	mutex_enter(&ire->ire_lock);
2376 	gw_addr = ire->ire_gateway_addr_v6;
2377 	mutex_exit(&ire->ire_lock);
2378 	gw_ipif = ire->ire_ipif;
2379 	ire_refrele(ire);
2380 
2381 	match_flags |= MATCH_IRE_IHANDLE;
2382 	ire = ire_ftable_lookup_v6(&gw_addr, 0, 0, IRE_INTERFACE,
2383 	    gw_ipif, NULL, ALL_ZONES, cire->ire_ihandle,
2384 	    NULL, match_flags);
2385 	return (ire);
2386 }
2387 
2388 /*
2389  * Return the IRE_LOOPBACK, IRE_IF_RESOLVER or IRE_IF_NORESOLVER
2390  * ire associated with the specified ipif.
2391  *
2392  * This might occasionally be called when IPIF_UP is not set since
2393  * the IPV6_MULTICAST_IF as well as creating interface routes
2394  * allows specifying a down ipif (ipif_lookup* match ipifs that are down).
2395  *
2396  * Note that if IPIF_NOLOCAL, IPIF_NOXMIT, or IPIF_DEPRECATED is set on
2397  * the ipif this routine might return NULL.
2398  * (Sometimes called as writer though not required by this function.)
2399  */
2400 ire_t *
2401 ipif_to_ire_v6(const ipif_t *ipif)
2402 {
2403 	ire_t	*ire;
2404 
2405 	ASSERT(ipif->ipif_isv6);
2406 	if (ipif->ipif_ire_type == IRE_LOOPBACK) {
2407 		ire = ire_ctable_lookup_v6(&ipif->ipif_v6lcl_addr, NULL,
2408 		    IRE_LOOPBACK, ipif, ALL_ZONES, NULL,
2409 		    (MATCH_IRE_TYPE | MATCH_IRE_IPIF));
2410 	} else if (ipif->ipif_flags & IPIF_POINTOPOINT) {
2411 		/* In this case we need to lookup destination address. */
2412 		ire = ire_ftable_lookup_v6(&ipif->ipif_v6pp_dst_addr,
2413 		    &ipv6_all_ones, NULL, IRE_INTERFACE, ipif, NULL, ALL_ZONES,
2414 		    0, NULL, (MATCH_IRE_TYPE | MATCH_IRE_IPIF |
2415 		    MATCH_IRE_MASK));
2416 	} else {
2417 		ire = ire_ftable_lookup_v6(&ipif->ipif_v6subnet,
2418 		    &ipif->ipif_v6net_mask, NULL, IRE_INTERFACE, ipif, NULL,
2419 		    ALL_ZONES, 0, NULL, (MATCH_IRE_TYPE | MATCH_IRE_IPIF |
2420 		    MATCH_IRE_MASK));
2421 	}
2422 	return (ire);
2423 }
2424 
2425 /*
2426  * Return B_TRUE if a multirt route is resolvable
2427  * (or if no route is resolved yet), B_FALSE otherwise.
2428  * This only works in the global zone.
2429  */
2430 boolean_t
2431 ire_multirt_need_resolve_v6(const in6_addr_t *v6dstp, const ts_label_t *tsl)
2432 {
2433 	ire_t	*first_fire;
2434 	ire_t	*first_cire;
2435 	ire_t	*fire;
2436 	ire_t	*cire;
2437 	irb_t	*firb;
2438 	irb_t	*cirb;
2439 	int	unres_cnt = 0;
2440 	boolean_t resolvable = B_FALSE;
2441 
2442 	/* Retrieve the first IRE_HOST that matches the destination */
2443 	first_fire = ire_ftable_lookup_v6(v6dstp, &ipv6_all_ones, 0, IRE_HOST,
2444 	    NULL, NULL, ALL_ZONES, 0, tsl, MATCH_IRE_MASK | MATCH_IRE_TYPE |
2445 	    MATCH_IRE_SECATTR);
2446 
2447 	/* No route at all */
2448 	if (first_fire == NULL) {
2449 		return (B_TRUE);
2450 	}
2451 
2452 	firb = first_fire->ire_bucket;
2453 	ASSERT(firb);
2454 
2455 	/* Retrieve the first IRE_CACHE ire for that destination. */
2456 	first_cire = ire_cache_lookup_v6(v6dstp, GLOBAL_ZONEID, tsl);
2457 
2458 	/* No resolved route. */
2459 	if (first_cire == NULL) {
2460 		ire_refrele(first_fire);
2461 		return (B_TRUE);
2462 	}
2463 
2464 	/* At least one route is resolved. */
2465 
2466 	cirb = first_cire->ire_bucket;
2467 	ASSERT(cirb);
2468 
2469 	/* Count the number of routes to that dest that are declared. */
2470 	IRB_REFHOLD(firb);
2471 	for (fire = first_fire; fire != NULL; fire = fire->ire_next) {
2472 		if (!(fire->ire_flags & RTF_MULTIRT))
2473 			continue;
2474 		if (!IN6_ARE_ADDR_EQUAL(&fire->ire_addr_v6, v6dstp))
2475 			continue;
2476 		unres_cnt++;
2477 	}
2478 	IRB_REFRELE(firb);
2479 
2480 
2481 	/* Then subtract the number of routes to that dst that are resolved */
2482 	IRB_REFHOLD(cirb);
2483 	for (cire = first_cire; cire != NULL; cire = cire->ire_next) {
2484 	    if (!(cire->ire_flags & RTF_MULTIRT))
2485 		continue;
2486 	    if (!IN6_ARE_ADDR_EQUAL(&cire->ire_addr_v6, v6dstp))
2487 		continue;
2488 	    if (cire->ire_marks & (IRE_MARK_CONDEMNED|IRE_MARK_HIDDEN))
2489 		continue;
2490 	    unres_cnt--;
2491 	}
2492 	IRB_REFRELE(cirb);
2493 
2494 	/* At least one route is unresolved; search for a resolvable route. */
2495 	if (unres_cnt > 0)
2496 		resolvable = ire_multirt_lookup_v6(&first_cire, &first_fire,
2497 		    MULTIRT_USESTAMP|MULTIRT_CACHEGW, tsl);
2498 
2499 	if (first_fire)
2500 		ire_refrele(first_fire);
2501 
2502 	if (first_cire)
2503 		ire_refrele(first_cire);
2504 
2505 	return (resolvable);
2506 }
2507 
2508 
2509 /*
2510  * Return B_TRUE and update *ire_arg and *fire_arg
2511  * if at least one resolvable route is found.
2512  * Return B_FALSE otherwise (all routes are resolved or
2513  * the remaining unresolved routes are all unresolvable).
2514  * This only works in the global zone.
2515  */
2516 boolean_t
2517 ire_multirt_lookup_v6(ire_t **ire_arg, ire_t **fire_arg, uint32_t flags,
2518     const ts_label_t *tsl)
2519 {
2520 	clock_t	delta;
2521 	ire_t	*best_fire = NULL;
2522 	ire_t	*best_cire = NULL;
2523 	ire_t	*first_fire;
2524 	ire_t	*first_cire;
2525 	ire_t	*fire;
2526 	ire_t	*cire;
2527 	irb_t	*firb = NULL;
2528 	irb_t	*cirb = NULL;
2529 	ire_t	*gw_ire;
2530 	boolean_t	already_resolved;
2531 	boolean_t	res;
2532 	in6_addr_t	v6dst;
2533 	in6_addr_t	v6gw;
2534 
2535 	ip2dbg(("ire_multirt_lookup_v6: *ire_arg %p, *fire_arg %p, "
2536 	    "flags %04x\n", (void *)*ire_arg, (void *)*fire_arg, flags));
2537 
2538 	ASSERT(ire_arg);
2539 	ASSERT(fire_arg);
2540 
2541 	/* Not an IRE_HOST ire; give up. */
2542 	if ((*fire_arg == NULL) ||
2543 	    ((*fire_arg)->ire_type != IRE_HOST)) {
2544 		return (B_FALSE);
2545 	}
2546 
2547 	/* This is the first IRE_HOST ire for that destination. */
2548 	first_fire = *fire_arg;
2549 	firb = first_fire->ire_bucket;
2550 	ASSERT(firb);
2551 
2552 	mutex_enter(&first_fire->ire_lock);
2553 	v6dst = first_fire->ire_addr_v6;
2554 	mutex_exit(&first_fire->ire_lock);
2555 
2556 	ip2dbg(("ire_multirt_lookup_v6: dst %08x\n",
2557 	    ntohl(V4_PART_OF_V6(v6dst))));
2558 
2559 	/*
2560 	 * Retrieve the first IRE_CACHE ire for that destination;
2561 	 * if we don't find one, no route for that dest is
2562 	 * resolved yet.
2563 	 */
2564 	first_cire = ire_cache_lookup_v6(&v6dst, GLOBAL_ZONEID, tsl);
2565 	if (first_cire) {
2566 		cirb = first_cire->ire_bucket;
2567 	}
2568 
2569 	ip2dbg(("ire_multirt_lookup_v6: first_cire %p\n", (void *)first_cire));
2570 
2571 	/*
2572 	 * Search for a resolvable route, giving the top priority
2573 	 * to routes that can be resolved without any call to the resolver.
2574 	 */
2575 	IRB_REFHOLD(firb);
2576 
2577 	if (!IN6_IS_ADDR_MULTICAST(&v6dst)) {
2578 		/*
2579 		 * For all multiroute IRE_HOST ires for that destination,
2580 		 * check if the route via the IRE_HOST's gateway is
2581 		 * resolved yet.
2582 		 */
2583 		for (fire = first_fire; fire != NULL; fire = fire->ire_next) {
2584 
2585 			if (!(fire->ire_flags & RTF_MULTIRT))
2586 				continue;
2587 			if (!IN6_ARE_ADDR_EQUAL(&fire->ire_addr_v6, &v6dst))
2588 				continue;
2589 
2590 			if (fire->ire_gw_secattr != NULL &&
2591 			    tsol_ire_match_gwattr(fire, tsl) != 0) {
2592 				continue;
2593 			}
2594 
2595 			mutex_enter(&fire->ire_lock);
2596 			v6gw = fire->ire_gateway_addr_v6;
2597 			mutex_exit(&fire->ire_lock);
2598 
2599 			ip2dbg(("ire_multirt_lookup_v6: fire %p, "
2600 			    "ire_addr %08x, ire_gateway_addr %08x\n",
2601 			    (void *)fire,
2602 			    ntohl(V4_PART_OF_V6(fire->ire_addr_v6)),
2603 			    ntohl(V4_PART_OF_V6(v6gw))));
2604 
2605 			already_resolved = B_FALSE;
2606 
2607 			if (first_cire) {
2608 				ASSERT(cirb);
2609 
2610 				IRB_REFHOLD(cirb);
2611 				/*
2612 				 * For all IRE_CACHE ires for that
2613 				 * destination.
2614 				 */
2615 				for (cire = first_cire;
2616 				    cire != NULL;
2617 				    cire = cire->ire_next) {
2618 
2619 					if (!(cire->ire_flags & RTF_MULTIRT))
2620 						continue;
2621 					if (!IN6_ARE_ADDR_EQUAL(
2622 					    &cire->ire_addr_v6, &v6dst))
2623 						continue;
2624 					if (cire->ire_marks &
2625 					    (IRE_MARK_CONDEMNED|
2626 						IRE_MARK_HIDDEN))
2627 						continue;
2628 
2629 					if (cire->ire_gw_secattr != NULL &&
2630 					    tsol_ire_match_gwattr(cire,
2631 					    tsl) != 0) {
2632 						continue;
2633 					}
2634 
2635 					/*
2636 					 * Check if the IRE_CACHE's gateway
2637 					 * matches the IRE_HOST's gateway.
2638 					 */
2639 					if (IN6_ARE_ADDR_EQUAL(
2640 					    &cire->ire_gateway_addr_v6,
2641 					    &v6gw)) {
2642 						already_resolved = B_TRUE;
2643 						break;
2644 					}
2645 				}
2646 				IRB_REFRELE(cirb);
2647 			}
2648 
2649 			/*
2650 			 * This route is already resolved;
2651 			 * proceed with next one.
2652 			 */
2653 			if (already_resolved) {
2654 				ip2dbg(("ire_multirt_lookup_v6: found cire %p, "
2655 				    "already resolved\n", (void *)cire));
2656 				continue;
2657 			}
2658 
2659 			/*
2660 			 * The route is unresolved; is it actually
2661 			 * resolvable, i.e. is there a cache or a resolver
2662 			 * for the gateway?
2663 			 */
2664 			gw_ire = ire_route_lookup_v6(&v6gw, 0, 0, 0, NULL, NULL,
2665 			    ALL_ZONES, tsl, MATCH_IRE_RECURSIVE |
2666 			    MATCH_IRE_SECATTR);
2667 
2668 			ip2dbg(("ire_multirt_lookup_v6: looked up gw_ire %p\n",
2669 			    (void *)gw_ire));
2670 
2671 			/*
2672 			 * This route can be resolved without any call to the
2673 			 * resolver; if the MULTIRT_CACHEGW flag is set,
2674 			 * give the top priority to this ire and exit the
2675 			 * loop.
2676 			 * This occurs when an resolver reply is processed
2677 			 * through ip_wput_nondata()
2678 			 */
2679 			if ((flags & MULTIRT_CACHEGW) &&
2680 			    (gw_ire != NULL) &&
2681 			    (gw_ire->ire_type & IRE_CACHETABLE)) {
2682 				/*
2683 				 * Release the resolver associated to the
2684 				 * previous candidate best ire, if any.
2685 				 */
2686 				if (best_cire) {
2687 					ire_refrele(best_cire);
2688 					ASSERT(best_fire);
2689 				}
2690 
2691 				best_fire = fire;
2692 				best_cire = gw_ire;
2693 
2694 				ip2dbg(("ire_multirt_lookup_v6: found top prio "
2695 				    "best_fire %p, best_cire %p\n",
2696 				    (void *)best_fire, (void *)best_cire));
2697 				break;
2698 			}
2699 
2700 			/*
2701 			 * Compute the time elapsed since our preceding
2702 			 * attempt to  resolve that route.
2703 			 * If the MULTIRT_USESTAMP flag is set, we take that
2704 			 * route into account only if this time interval
2705 			 * exceeds ip_multirt_resolution_interval;
2706 			 * this prevents us from attempting to resolve a
2707 			 * broken route upon each sending of a packet.
2708 			 */
2709 			delta = lbolt - fire->ire_last_used_time;
2710 			delta = TICK_TO_MSEC(delta);
2711 
2712 			res = (boolean_t)
2713 			    ((delta > ip_multirt_resolution_interval) ||
2714 				(!(flags & MULTIRT_USESTAMP)));
2715 
2716 			ip2dbg(("ire_multirt_lookup_v6: fire %p, delta %lu, "
2717 			    "res %d\n",
2718 			    (void *)fire, delta, res));
2719 
2720 			if (res) {
2721 				/*
2722 				 * A resolver exists for the gateway: save
2723 				 * the current IRE_HOST ire as a candidate
2724 				 * best ire. If we later discover that a
2725 				 * top priority ire exists (i.e. no need to
2726 				 * call the resolver), then this new ire
2727 				 * will be preferred to the current one.
2728 				 */
2729 				if (gw_ire != NULL) {
2730 					if (best_fire == NULL) {
2731 						ASSERT(best_cire == NULL);
2732 
2733 						best_fire = fire;
2734 						best_cire = gw_ire;
2735 
2736 						ip2dbg(("ire_multirt_lookup_v6:"
2737 						    "found candidate "
2738 						    "best_fire %p, "
2739 						    "best_cire %p\n",
2740 						    (void *)best_fire,
2741 						    (void *)best_cire));
2742 
2743 						/*
2744 						 * If MULTIRT_CACHEGW is not
2745 						 * set, we ignore the top
2746 						 * priority ires that can
2747 						 * be resolved without any
2748 						 * call to the resolver;
2749 						 * In that case, there is
2750 						 * actually no need
2751 						 * to continue the loop.
2752 						 */
2753 						if (!(flags &
2754 						    MULTIRT_CACHEGW)) {
2755 							break;
2756 						}
2757 						continue;
2758 					}
2759 				} else {
2760 					/*
2761 					 * No resolver for the gateway: the
2762 					 * route is not resolvable.
2763 					 * If the MULTIRT_SETSTAMP flag is
2764 					 * set, we stamp the IRE_HOST ire,
2765 					 * so we will not select it again
2766 					 * during this resolution interval.
2767 					 */
2768 					if (flags & MULTIRT_SETSTAMP)
2769 						fire->ire_last_used_time =
2770 						    lbolt;
2771 				}
2772 			}
2773 
2774 			if (gw_ire != NULL)
2775 				ire_refrele(gw_ire);
2776 		}
2777 	} else { /* IN6_IS_ADDR_MULTICAST(&v6dst) */
2778 
2779 		for (fire = first_fire;
2780 		    fire != NULL;
2781 		    fire = fire->ire_next) {
2782 
2783 			if (!(fire->ire_flags & RTF_MULTIRT))
2784 				continue;
2785 			if (!IN6_ARE_ADDR_EQUAL(&fire->ire_addr_v6, &v6dst))
2786 				continue;
2787 
2788 			if (fire->ire_gw_secattr != NULL &&
2789 			    tsol_ire_match_gwattr(fire, tsl) != 0) {
2790 				continue;
2791 			}
2792 
2793 			already_resolved = B_FALSE;
2794 
2795 			mutex_enter(&fire->ire_lock);
2796 			v6gw = fire->ire_gateway_addr_v6;
2797 			mutex_exit(&fire->ire_lock);
2798 
2799 			gw_ire = ire_ftable_lookup_v6(&v6gw, 0, 0,
2800 			    IRE_INTERFACE, NULL, NULL, ALL_ZONES, 0, tsl,
2801 			    MATCH_IRE_RECURSIVE | MATCH_IRE_TYPE |
2802 			    MATCH_IRE_SECATTR);
2803 
2804 			/* No resolver for the gateway; we skip this ire. */
2805 			if (gw_ire == NULL) {
2806 				continue;
2807 			}
2808 
2809 			if (first_cire) {
2810 
2811 				IRB_REFHOLD(cirb);
2812 				/*
2813 				 * For all IRE_CACHE ires for that
2814 				 * destination.
2815 				 */
2816 				for (cire = first_cire;
2817 				    cire != NULL;
2818 				    cire = cire->ire_next) {
2819 
2820 					if (!(cire->ire_flags & RTF_MULTIRT))
2821 						continue;
2822 					if (!IN6_ARE_ADDR_EQUAL(
2823 					    &cire->ire_addr_v6, &v6dst))
2824 						continue;
2825 					if (cire->ire_marks &
2826 					    (IRE_MARK_CONDEMNED|
2827 						IRE_MARK_HIDDEN))
2828 						continue;
2829 
2830 					if (cire->ire_gw_secattr != NULL &&
2831 					    tsol_ire_match_gwattr(cire,
2832 					    tsl) != 0) {
2833 						continue;
2834 					}
2835 
2836 					/*
2837 					 * Cache entries are linked to the
2838 					 * parent routes using the parent handle
2839 					 * (ire_phandle). If no cache entry has
2840 					 * the same handle as fire, fire is
2841 					 * still unresolved.
2842 					 */
2843 					ASSERT(cire->ire_phandle != 0);
2844 					if (cire->ire_phandle ==
2845 					    fire->ire_phandle) {
2846 						already_resolved = B_TRUE;
2847 						break;
2848 					}
2849 				}
2850 				IRB_REFRELE(cirb);
2851 			}
2852 
2853 			/*
2854 			 * This route is already resolved; proceed with
2855 			 * next one.
2856 			 */
2857 			if (already_resolved) {
2858 				ire_refrele(gw_ire);
2859 				continue;
2860 			}
2861 
2862 			/*
2863 			 * Compute the time elapsed since our preceding
2864 			 * attempt to resolve that route.
2865 			 * If the MULTIRT_USESTAMP flag is set, we take
2866 			 * that route into account only if this time
2867 			 * interval exceeds ip_multirt_resolution_interval;
2868 			 * this prevents us from attempting to resolve a
2869 			 * broken route upon each sending of a packet.
2870 			 */
2871 			delta = lbolt - fire->ire_last_used_time;
2872 			delta = TICK_TO_MSEC(delta);
2873 
2874 			res = (boolean_t)
2875 			    ((delta > ip_multirt_resolution_interval) ||
2876 			    (!(flags & MULTIRT_USESTAMP)));
2877 
2878 			ip3dbg(("ire_multirt_lookup_v6: fire %p, delta %lx, "
2879 			    "flags %04x, res %d\n",
2880 			    (void *)fire, delta, flags, res));
2881 
2882 			if (res) {
2883 				if (best_cire) {
2884 					/*
2885 					 * Release the resolver associated
2886 					 * to the preceding candidate best
2887 					 * ire, if any.
2888 					 */
2889 					ire_refrele(best_cire);
2890 					ASSERT(best_fire);
2891 				}
2892 				best_fire = fire;
2893 				best_cire = gw_ire;
2894 				continue;
2895 			}
2896 
2897 			ire_refrele(gw_ire);
2898 		}
2899 	}
2900 
2901 	if (best_fire) {
2902 		IRE_REFHOLD(best_fire);
2903 	}
2904 	IRB_REFRELE(firb);
2905 
2906 	/* Release the first IRE_CACHE we initially looked up, if any. */
2907 	if (first_cire)
2908 		ire_refrele(first_cire);
2909 
2910 	/* Found a resolvable route. */
2911 	if (best_fire) {
2912 		ASSERT(best_cire);
2913 
2914 		if (*fire_arg)
2915 			ire_refrele(*fire_arg);
2916 		if (*ire_arg)
2917 			ire_refrele(*ire_arg);
2918 
2919 		/*
2920 		 * Update the passed arguments with the
2921 		 * resolvable multirt route we found
2922 		 */
2923 		*fire_arg = best_fire;
2924 		*ire_arg = best_cire;
2925 
2926 		ip2dbg(("ire_multirt_lookup_v6: returning B_TRUE, "
2927 		    "*fire_arg %p, *ire_arg %p\n",
2928 		    (void *)best_fire, (void *)best_cire));
2929 
2930 		return (B_TRUE);
2931 	}
2932 
2933 	ASSERT(best_cire == NULL);
2934 
2935 	ip2dbg(("ire_multirt_lookup_v6: returning B_FALSE, *fire_arg %p, "
2936 	    "*ire_arg %p\n",
2937 	    (void *)*fire_arg, (void *)*ire_arg));
2938 
2939 	/* No resolvable route. */
2940 	return (B_FALSE);
2941 }
2942 
2943 
2944 /*
2945  * Find an IRE_OFFSUBNET IRE entry for the multicast address 'v6dstp'
2946  * that goes through 'ipif'. As a fallback, a route that goes through
2947  * ipif->ipif_ill can be returned.
2948  */
2949 ire_t *
2950 ipif_lookup_multi_ire_v6(ipif_t *ipif, const in6_addr_t *v6dstp)
2951 {
2952 	ire_t	*ire;
2953 	ire_t	*save_ire = NULL;
2954 	ire_t   *gw_ire;
2955 	irb_t   *irb;
2956 	in6_addr_t v6gw;
2957 	int	match_flags = MATCH_IRE_TYPE | MATCH_IRE_ILL;
2958 
2959 	ire = ire_ftable_lookup_v6(v6dstp, 0, 0, 0, NULL, NULL, ALL_ZONES, 0,
2960 	    NULL, MATCH_IRE_DEFAULT);
2961 
2962 	if (ire == NULL)
2963 		return (NULL);
2964 
2965 	irb = ire->ire_bucket;
2966 	ASSERT(irb);
2967 
2968 	IRB_REFHOLD(irb);
2969 	ire_refrele(ire);
2970 	for (ire = irb->irb_ire; ire != NULL; ire = ire->ire_next) {
2971 		if (!IN6_ARE_ADDR_EQUAL(&ire->ire_addr_v6, v6dstp) ||
2972 		    (ipif->ipif_zoneid != ire->ire_zoneid &&
2973 		    ire->ire_zoneid != ALL_ZONES)) {
2974 			continue;
2975 		}
2976 
2977 		switch (ire->ire_type) {
2978 		case IRE_DEFAULT:
2979 		case IRE_PREFIX:
2980 		case IRE_HOST:
2981 			mutex_enter(&ire->ire_lock);
2982 			v6gw = ire->ire_gateway_addr_v6;
2983 			mutex_exit(&ire->ire_lock);
2984 			gw_ire = ire_ftable_lookup_v6(&v6gw, 0, 0,
2985 			    IRE_INTERFACE, ipif, NULL, ALL_ZONES, 0,
2986 			    NULL, match_flags);
2987 
2988 			if (gw_ire != NULL) {
2989 				if (save_ire != NULL) {
2990 					ire_refrele(save_ire);
2991 				}
2992 				IRE_REFHOLD(ire);
2993 				if (gw_ire->ire_ipif == ipif) {
2994 					ire_refrele(gw_ire);
2995 
2996 					IRB_REFRELE(irb);
2997 					return (ire);
2998 				}
2999 				ire_refrele(gw_ire);
3000 				save_ire = ire;
3001 			}
3002 			break;
3003 		case IRE_IF_NORESOLVER:
3004 		case IRE_IF_RESOLVER:
3005 			if (ire->ire_ipif == ipif) {
3006 				if (save_ire != NULL) {
3007 					ire_refrele(save_ire);
3008 				}
3009 				IRE_REFHOLD(ire);
3010 
3011 				IRB_REFRELE(irb);
3012 				return (ire);
3013 			}
3014 			break;
3015 		}
3016 	}
3017 	IRB_REFRELE(irb);
3018 
3019 	return (save_ire);
3020 }
3021