xref: /titanic_51/usr/src/uts/common/inet/ip/ip6_ire.c (revision faebf79429f1b4c0f9334ac0f880806edf891f1f)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 /*
26  * Copyright (c) 1990 Mentat Inc.
27  */
28 
29 #pragma ident	"%Z%%M%	%I%	%E% SMI"
30 
31 /*
32  * This file contains routines that manipulate Internet Routing Entries (IREs).
33  */
34 #include <sys/types.h>
35 #include <sys/stream.h>
36 #include <sys/stropts.h>
37 #include <sys/ddi.h>
38 #include <sys/cmn_err.h>
39 
40 #include <sys/systm.h>
41 #include <sys/param.h>
42 #include <sys/socket.h>
43 #include <net/if.h>
44 #include <net/route.h>
45 #include <netinet/in.h>
46 #include <net/if_dl.h>
47 #include <netinet/ip6.h>
48 #include <netinet/icmp6.h>
49 
50 #include <inet/common.h>
51 #include <inet/mi.h>
52 #include <inet/ip.h>
53 #include <inet/ip6.h>
54 #include <inet/ip_ndp.h>
55 #include <inet/ip_if.h>
56 #include <inet/ip_ire.h>
57 #include <inet/ipclassifier.h>
58 #include <inet/nd.h>
59 #include <sys/kmem.h>
60 #include <sys/zone.h>
61 
62 #include <sys/tsol/label.h>
63 #include <sys/tsol/tnet.h>
64 
65 irb_t *ip_forwarding_table_v6[IP6_MASK_TABLE_SIZE];
66 /* This is dynamically allocated in ip_ire_init */
67 irb_t *ip_cache_table_v6;
68 static	ire_t	ire_null;
69 
70 /* Defined in ip_ire.c */
71 extern uint32_t ip6_cache_table_size;
72 extern uint32_t ip6_ftable_hash_size;
73 
74 static ire_t	*ire_ihandle_lookup_onlink_v6(ire_t *cire);
75 static	void	ire_report_ftable_v6(ire_t *ire, char *mp);
76 static	void	ire_report_ctable_v6(ire_t *ire, char *mp);
77 static boolean_t ire_match_args_v6(ire_t *ire, const in6_addr_t *addr,
78     const in6_addr_t *mask, const in6_addr_t *gateway, int type,
79     const ipif_t *ipif, zoneid_t zoneid, uint32_t ihandle,
80     const ts_label_t *tsl, int match_flags);
81 
82 /*
83  * Named Dispatch routine to produce a formatted report on all IREs.
84  * This report is accessed by using the ndd utility to "get" ND variable
85  * "ip_ire_status_v6".
86  */
87 /* ARGSUSED */
88 int
89 ip_ire_report_v6(queue_t *q, mblk_t *mp, caddr_t arg, cred_t *ioc_cr)
90 {
91 	zoneid_t zoneid;
92 
93 	(void) mi_mpprintf(mp,
94 	    "IRE      " MI_COL_HDRPAD_STR
95 	    "rfq      " MI_COL_HDRPAD_STR
96 	    "stq      " MI_COL_HDRPAD_STR
97 	    " zone mxfrg rtt   rtt_sd ssthresh ref "
98 	    "rtomax tstamp_ok wscale_ok ecn_ok pmtud_ok sack sendpipe recvpipe "
99 	    "in/out/forward type    addr         mask         "
100 	    "src             gateway");
101 	/*
102 	 *   01234567 01234567 01234567 12345 12345 12345 12345  12345678 123
103 	 *   123456 123456789 123456789 123456 12345678 1234 12345678 12345678
104 	 *   in/out/forward xxxxxxxxxx
105 	 *   xxxx:xxxx:xxxx:xxxx:xxxx:xxxx:xxxx:xxxx
106 	 *   xxxx:xxxx:xxxx:xxxx:xxxx:xxxx:xxxx:xxxx
107 	 *   xxxx:xxxx:xxxx:xxxx:xxxx:xxxx:xxxx:xxxx
108 	 *   xxxx:xxxx:xxxx:xxxx:xxxx:xxxx:xxxx:xxxx
109 	 */
110 
111 	/*
112 	 * Because of the ndd constraint, at most we can have 64K buffer
113 	 * to put in all IRE info.  So to be more efficient, just
114 	 * allocate a 64K buffer here, assuming we need that large buffer.
115 	 * This should be OK as only root can do ndd /dev/ip.
116 	 */
117 	if ((mp->b_cont = allocb(ND_MAX_BUF_LEN, BPRI_HI)) == NULL) {
118 		/* The following may work even if we cannot get a large buf. */
119 		(void) mi_mpprintf(mp, "<< Out of buffer >>\n");
120 		return (0);
121 	}
122 	zoneid = Q_TO_CONN(q)->conn_zoneid;
123 	if (zoneid == GLOBAL_ZONEID)
124 		zoneid = ALL_ZONES;
125 
126 	ire_walk_v6(ire_report_ftable_v6, (char *)mp->b_cont, zoneid);
127 	ire_walk_v6(ire_report_ctable_v6, (char *)mp->b_cont, zoneid);
128 	return (0);
129 }
130 
131 /*
132  * ire_walk routine invoked for ip_ire_report_v6 for each IRE.
133  */
134 static void
135 ire_report_ftable_v6(ire_t *ire, char *mp)
136 {
137 	char	buf1[INET6_ADDRSTRLEN];
138 	char	buf2[INET6_ADDRSTRLEN];
139 	char	buf3[INET6_ADDRSTRLEN];
140 	char	buf4[INET6_ADDRSTRLEN];
141 	uint_t	fo_pkt_count;
142 	uint_t	ib_pkt_count;
143 	int	ref;
144 	in6_addr_t gw_addr_v6;
145 	uint_t	print_len, buf_len;
146 
147 	ASSERT(ire->ire_ipversion == IPV6_VERSION);
148 	if (ire->ire_type & IRE_CACHETABLE)
149 	    return;
150 	buf_len = ((mblk_t *)mp)->b_datap->db_lim - ((mblk_t *)mp)->b_wptr;
151 	if (buf_len <= 0)
152 		return;
153 
154 	/* Number of active references of this ire */
155 	ref = ire->ire_refcnt;
156 	/* "inbound" to a non local address is a forward */
157 	ib_pkt_count = ire->ire_ib_pkt_count;
158 	fo_pkt_count = 0;
159 	ASSERT(!(ire->ire_type & IRE_BROADCAST));
160 	if (!(ire->ire_type & (IRE_LOCAL|IRE_BROADCAST))) {
161 		fo_pkt_count = ib_pkt_count;
162 		ib_pkt_count = 0;
163 	}
164 
165 	mutex_enter(&ire->ire_lock);
166 	gw_addr_v6 = ire->ire_gateway_addr_v6;
167 	mutex_exit(&ire->ire_lock);
168 
169 	print_len = snprintf((char *)((mblk_t *)mp)->b_wptr, buf_len,
170 	    MI_COL_PTRFMT_STR MI_COL_PTRFMT_STR MI_COL_PTRFMT_STR "%5d "
171 	    "%05d %05ld %06ld %08d %03d %06d %09d %09d %06d %08d "
172 	    "%04d %08d %08d %d/%d/%d %s\n\t%s\n\t%s\n\t%s\n\t%s\n",
173 	    (void *)ire, (void *)ire->ire_rfq, (void *)ire->ire_stq,
174 	    (int)ire->ire_zoneid,
175 	    ire->ire_max_frag, ire->ire_uinfo.iulp_rtt,
176 	    ire->ire_uinfo.iulp_rtt_sd,
177 	    ire->ire_uinfo.iulp_ssthresh, ref,
178 	    ire->ire_uinfo.iulp_rtomax,
179 	    (ire->ire_uinfo.iulp_tstamp_ok ? 1: 0),
180 	    (ire->ire_uinfo.iulp_wscale_ok ? 1: 0),
181 	    (ire->ire_uinfo.iulp_ecn_ok ? 1: 0),
182 	    (ire->ire_uinfo.iulp_pmtud_ok ? 1: 0),
183 	    ire->ire_uinfo.iulp_sack,
184 	    ire->ire_uinfo.iulp_spipe, ire->ire_uinfo.iulp_rpipe,
185 	    ib_pkt_count, ire->ire_ob_pkt_count, fo_pkt_count,
186 	    ip_nv_lookup(ire_nv_tbl, (int)ire->ire_type),
187 	    inet_ntop(AF_INET6, &ire->ire_addr_v6, buf1, sizeof (buf1)),
188 	    inet_ntop(AF_INET6, &ire->ire_mask_v6, buf2, sizeof (buf2)),
189 	    inet_ntop(AF_INET6, &ire->ire_src_addr_v6, buf3, sizeof (buf3)),
190 	    inet_ntop(AF_INET6, &gw_addr_v6, buf4, sizeof (buf4)));
191 	if (print_len < buf_len) {
192 		((mblk_t *)mp)->b_wptr += print_len;
193 	} else {
194 		((mblk_t *)mp)->b_wptr += buf_len;
195 	}
196 }
197 
198 /* ire_walk routine invoked for ip_ire_report_v6 for each IRE. */
199 static void
200 ire_report_ctable_v6(ire_t *ire, char *mp)
201 {
202 	char	buf1[INET6_ADDRSTRLEN];
203 	char	buf2[INET6_ADDRSTRLEN];
204 	char	buf3[INET6_ADDRSTRLEN];
205 	char	buf4[INET6_ADDRSTRLEN];
206 	uint_t	fo_pkt_count;
207 	uint_t	ib_pkt_count;
208 	int	ref;
209 	in6_addr_t gw_addr_v6;
210 	uint_t	print_len, buf_len;
211 
212 	if ((ire->ire_type & IRE_CACHETABLE) == 0)
213 		return;
214 	buf_len = ((mblk_t *)mp)->b_datap->db_lim - ((mblk_t *)mp)->b_wptr;
215 	if (buf_len <= 0)
216 		return;
217 
218 	/* Number of active references of this ire */
219 	ref = ire->ire_refcnt;
220 	/* "inbound" to a non local address is a forward */
221 	ib_pkt_count = ire->ire_ib_pkt_count;
222 	fo_pkt_count = 0;
223 	ASSERT(!(ire->ire_type & IRE_BROADCAST));
224 	if (ire->ire_type & IRE_LOCAL) {
225 		fo_pkt_count = ib_pkt_count;
226 		ib_pkt_count = 0;
227 	}
228 
229 	mutex_enter(&ire->ire_lock);
230 	gw_addr_v6 = ire->ire_gateway_addr_v6;
231 	mutex_exit(&ire->ire_lock);
232 
233 	print_len =  snprintf((char *)((mblk_t *)mp)->b_wptr, buf_len,
234 	    MI_COL_PTRFMT_STR MI_COL_PTRFMT_STR MI_COL_PTRFMT_STR "%5d "
235 	    "%05d %05ld %06ld %08d %03d %06d %09d %09d %06d %08d "
236 	    "%04d %08d %08d %d/%d/%d %s\n\t%s\n\t%s\n\t%s\n\t%s\n",
237 	    (void *)ire, (void *)ire->ire_rfq, (void *)ire->ire_stq,
238 	    (int)ire->ire_zoneid,
239 	    ire->ire_max_frag, ire->ire_uinfo.iulp_rtt,
240 	    ire->ire_uinfo.iulp_rtt_sd, ire->ire_uinfo.iulp_ssthresh, ref,
241 	    ire->ire_uinfo.iulp_rtomax,
242 	    (ire->ire_uinfo.iulp_tstamp_ok ? 1: 0),
243 	    (ire->ire_uinfo.iulp_wscale_ok ? 1: 0),
244 	    (ire->ire_uinfo.iulp_ecn_ok ? 1: 0),
245 	    (ire->ire_uinfo.iulp_pmtud_ok ? 1: 0),
246 	    ire->ire_uinfo.iulp_sack,
247 	    ire->ire_uinfo.iulp_spipe, ire->ire_uinfo.iulp_rpipe,
248 	    ib_pkt_count, ire->ire_ob_pkt_count,
249 	    fo_pkt_count, ip_nv_lookup(ire_nv_tbl, (int)ire->ire_type),
250 	    inet_ntop(AF_INET6, &ire->ire_addr_v6, buf1, sizeof (buf1)),
251 	    inet_ntop(AF_INET6, &ire->ire_mask_v6, buf2, sizeof (buf2)),
252 	    inet_ntop(AF_INET6, &ire->ire_src_addr_v6, buf3, sizeof (buf3)),
253 	    inet_ntop(AF_INET6, &gw_addr_v6, buf4, sizeof (buf4)));
254 	if (print_len < buf_len) {
255 		((mblk_t *)mp)->b_wptr += print_len;
256 	} else {
257 		((mblk_t *)mp)->b_wptr += buf_len;
258 	}
259 }
260 
261 
262 /*
263  * Initialize the ire that is specific to IPv6 part and call
264  * ire_init_common to finish it.
265  */
266 ire_t *
267 ire_init_v6(ire_t *ire, const in6_addr_t *v6addr,
268     const in6_addr_t *v6mask, const in6_addr_t *v6src_addr,
269     const in6_addr_t *v6gateway, uint_t *max_fragp,
270     mblk_t *fp_mp, queue_t *rfq, queue_t *stq, ushort_t type,
271     mblk_t *dlureq_mp, ipif_t *ipif, const in6_addr_t *v6cmask,
272     uint32_t phandle, uint32_t ihandle, uint_t flags, const iulp_t *ulp_info,
273     tsol_gc_t *gc, tsol_gcgrp_t *gcgrp)
274 {
275 	/*
276 	 * Reject IRE security attribute creation/initialization
277 	 * if system is not running in Trusted mode.
278 	 */
279 	if ((gc != NULL || gcgrp != NULL) && !is_system_labeled())
280 		return (NULL);
281 
282 	if (fp_mp != NULL) {
283 		/*
284 		 * We can't dupb() here as multiple threads could be
285 		 * calling dupb on the same mp which is incorrect.
286 		 * First dupb() should be called only by one thread.
287 		 */
288 		fp_mp = copyb(fp_mp);
289 		if (fp_mp == NULL)
290 			return (NULL);
291 	}
292 
293 	if (dlureq_mp != NULL) {
294 		/*
295 		 * We can't dupb() here as multiple threads could be
296 		 * calling dupb on the same mp which is incorrect.
297 		 * First dupb() should be called only by one thread.
298 		 */
299 		dlureq_mp = copyb(dlureq_mp);
300 		if (dlureq_mp == NULL) {
301 			if (fp_mp != NULL)
302 				freeb(fp_mp);
303 			return (NULL);
304 		}
305 	}
306 
307 	BUMP_IRE_STATS(ire_stats_v6, ire_stats_alloced);
308 	ire->ire_addr_v6 = *v6addr;
309 
310 	if (v6src_addr != NULL)
311 		ire->ire_src_addr_v6 = *v6src_addr;
312 	if (v6mask != NULL) {
313 		ire->ire_mask_v6 = *v6mask;
314 		ire->ire_masklen = ip_mask_to_plen_v6(&ire->ire_mask_v6);
315 	}
316 	if (v6gateway != NULL)
317 		ire->ire_gateway_addr_v6 = *v6gateway;
318 
319 	if (type == IRE_CACHE && v6cmask != NULL)
320 		ire->ire_cmask_v6 = *v6cmask;
321 
322 	/*
323 	 * Multirouted packets need to have a fragment header added so that
324 	 * the receiver is able to discard duplicates according to their
325 	 * fragment identifier.
326 	 */
327 	if (type == IRE_CACHE && (flags & RTF_MULTIRT)) {
328 		ire->ire_frag_flag = IPH_FRAG_HDR;
329 	}
330 
331 	/* ire_init_common will free the mblks upon encountering any failure */
332 	if (!ire_init_common(ire, max_fragp, fp_mp, rfq, stq, type, dlureq_mp,
333 	    ipif, NULL, phandle, ihandle, flags, IPV6_VERSION, ulp_info,
334 	    gc, gcgrp))
335 		return (NULL);
336 
337 	return (ire);
338 }
339 
340 /*
341  * Similar to ire_create_v6 except that it is called only when
342  * we want to allocate ire as an mblk e.g. we have a external
343  * resolver. Do we need this in IPv6 ?
344  */
345 ire_t *
346 ire_create_mp_v6(const in6_addr_t *v6addr, const in6_addr_t *v6mask,
347     const in6_addr_t *v6src_addr, const in6_addr_t *v6gateway,
348     mblk_t *fp_mp, queue_t *rfq, queue_t *stq, ushort_t type,
349     mblk_t *dlureq_mp, ipif_t *ipif, const in6_addr_t *v6cmask,
350     uint32_t phandle, uint32_t ihandle, uint_t flags, const iulp_t *ulp_info,
351     tsol_gc_t *gc, tsol_gcgrp_t *gcgrp)
352 {
353 	ire_t	*ire;
354 	ire_t	*ret_ire;
355 	mblk_t	*mp;
356 
357 	ASSERT(!IN6_IS_ADDR_V4MAPPED(v6addr));
358 
359 	/* Allocate the new IRE. */
360 	mp = allocb(sizeof (ire_t), BPRI_MED);
361 	if (mp == NULL) {
362 		ip1dbg(("ire_create_mp_v6: alloc failed\n"));
363 		return (NULL);
364 	}
365 
366 	ire = (ire_t *)mp->b_rptr;
367 	mp->b_wptr = (uchar_t *)&ire[1];
368 
369 	/* Start clean. */
370 	*ire = ire_null;
371 	ire->ire_mp = mp;
372 	mp->b_datap->db_type = IRE_DB_TYPE;
373 
374 	ret_ire = ire_init_v6(ire, v6addr, v6mask, v6src_addr, v6gateway,
375 	    NULL, fp_mp, rfq, stq, type, dlureq_mp, ipif, v6cmask, phandle,
376 	    ihandle, flags, ulp_info, gc, gcgrp);
377 
378 	if (ret_ire == NULL) {
379 		freeb(ire->ire_mp);
380 		return (NULL);
381 	}
382 	return (ire);
383 }
384 
385 /*
386  * ire_create_v6 is called to allocate and initialize a new IRE.
387  *
388  * NOTE : This is called as writer sometimes though not required
389  * by this function.
390  */
391 ire_t *
392 ire_create_v6(const in6_addr_t *v6addr, const in6_addr_t *v6mask,
393     const in6_addr_t *v6src_addr, const in6_addr_t *v6gateway,
394     uint_t *max_fragp, mblk_t *fp_mp, queue_t *rfq, queue_t *stq, ushort_t type,
395     mblk_t *dlureq_mp, ipif_t *ipif, const in6_addr_t *v6cmask,
396     uint32_t phandle, uint32_t ihandle, uint_t flags, const iulp_t *ulp_info,
397     tsol_gc_t *gc, tsol_gcgrp_t *gcgrp)
398 {
399 	ire_t	*ire;
400 	ire_t	*ret_ire;
401 
402 	ASSERT(!IN6_IS_ADDR_V4MAPPED(v6addr));
403 
404 	ire = kmem_cache_alloc(ire_cache, KM_NOSLEEP);
405 	if (ire == NULL) {
406 		ip1dbg(("ire_create_v6: alloc failed\n"));
407 		return (NULL);
408 	}
409 	*ire = ire_null;
410 
411 	ret_ire = ire_init_v6(ire, v6addr, v6mask, v6src_addr, v6gateway,
412 	    max_fragp, fp_mp, rfq, stq, type, dlureq_mp, ipif, v6cmask, phandle,
413 	    ihandle, flags, ulp_info, gc, gcgrp);
414 
415 	if (ret_ire == NULL) {
416 		kmem_cache_free(ire_cache, ire);
417 		return (NULL);
418 	}
419 	ASSERT(ret_ire == ire);
420 	return (ire);
421 }
422 
423 /*
424  * Find an IRE_INTERFACE for the multicast group.
425  * Allows different routes for multicast addresses
426  * in the unicast routing table (akin to FF::0/8 but could be more specific)
427  * which point at different interfaces. This is used when IPV6_MULTICAST_IF
428  * isn't specified (when sending) and when IPV6_JOIN_GROUP doesn't
429  * specify the interface to join on.
430  *
431  * Supports link-local addresses by following the ipif/ill when recursing.
432  */
433 ire_t *
434 ire_lookup_multi_v6(const in6_addr_t *group, zoneid_t zoneid)
435 {
436 	ire_t	*ire;
437 	ipif_t	*ipif = NULL;
438 	int	match_flags = MATCH_IRE_TYPE;
439 	in6_addr_t gw_addr_v6;
440 
441 	ire = ire_ftable_lookup_v6(group, 0, 0, 0, NULL, NULL,
442 	    zoneid, 0, NULL, MATCH_IRE_DEFAULT);
443 
444 	/* We search a resolvable ire in case of multirouting. */
445 	if ((ire != NULL) && (ire->ire_flags & RTF_MULTIRT)) {
446 		ire_t *cire = NULL;
447 		/*
448 		 * If the route is not resolvable, the looked up ire
449 		 * may be changed here. In that case, ire_multirt_lookup()
450 		 * IRE_REFRELE the original ire and change it.
451 		 */
452 		(void) ire_multirt_lookup_v6(&cire, &ire, MULTIRT_CACHEGW,
453 		    NULL);
454 		if (cire != NULL)
455 			ire_refrele(cire);
456 	}
457 	if (ire == NULL)
458 		return (NULL);
459 	/*
460 	 * Make sure we follow ire_ipif.
461 	 *
462 	 * We need to determine the interface route through
463 	 * which the gateway will be reached. We don't really
464 	 * care which interface is picked if the interface is
465 	 * part of a group.
466 	 */
467 	if (ire->ire_ipif != NULL) {
468 		ipif = ire->ire_ipif;
469 		match_flags |= MATCH_IRE_ILL_GROUP;
470 	}
471 
472 	switch (ire->ire_type) {
473 	case IRE_DEFAULT:
474 	case IRE_PREFIX:
475 	case IRE_HOST:
476 		mutex_enter(&ire->ire_lock);
477 		gw_addr_v6 = ire->ire_gateway_addr_v6;
478 		mutex_exit(&ire->ire_lock);
479 		ire_refrele(ire);
480 		ire = ire_ftable_lookup_v6(&gw_addr_v6, 0, 0,
481 		    IRE_INTERFACE, ipif, NULL, zoneid, 0,
482 		    NULL, match_flags);
483 		return (ire);
484 	case IRE_IF_NORESOLVER:
485 	case IRE_IF_RESOLVER:
486 		return (ire);
487 	default:
488 		ire_refrele(ire);
489 		return (NULL);
490 	}
491 }
492 
493 /*
494  * Return any local address.  We use this to target ourselves
495  * when the src address was specified as 'default'.
496  * Preference for IRE_LOCAL entries.
497  */
498 ire_t *
499 ire_lookup_local_v6(zoneid_t zoneid)
500 {
501 	ire_t	*ire;
502 	irb_t	*irb;
503 	ire_t	*maybe = NULL;
504 	int i;
505 
506 	for (i = 0; i < ip6_cache_table_size;  i++) {
507 		irb = &ip_cache_table_v6[i];
508 		if (irb->irb_ire == NULL)
509 			continue;
510 		rw_enter(&irb->irb_lock, RW_READER);
511 		for (ire = irb->irb_ire; ire; ire = ire->ire_next) {
512 			if ((ire->ire_marks & IRE_MARK_CONDEMNED) ||
513 			    ire->ire_zoneid != zoneid &&
514 			    ire->ire_zoneid != ALL_ZONES)
515 				continue;
516 			switch (ire->ire_type) {
517 			case IRE_LOOPBACK:
518 				if (maybe == NULL) {
519 					IRE_REFHOLD(ire);
520 					maybe = ire;
521 				}
522 				break;
523 			case IRE_LOCAL:
524 				if (maybe != NULL) {
525 					ire_refrele(maybe);
526 				}
527 				IRE_REFHOLD(ire);
528 				rw_exit(&irb->irb_lock);
529 				return (ire);
530 			}
531 		}
532 		rw_exit(&irb->irb_lock);
533 	}
534 	return (maybe);
535 }
536 
537 /*
538  * This function takes a mask and returns number of bits set in the
539  * mask (the represented prefix length).  Assumes a contiguous mask.
540  */
541 int
542 ip_mask_to_plen_v6(const in6_addr_t *v6mask)
543 {
544 	int		bits;
545 	int		plen = IPV6_ABITS;
546 	int		i;
547 
548 	for (i = 3; i >= 0; i--) {
549 		if (v6mask->s6_addr32[i] == 0) {
550 			plen -= 32;
551 			continue;
552 		}
553 		bits = ffs(ntohl(v6mask->s6_addr32[i])) - 1;
554 		if (bits == 0)
555 			break;
556 		plen -= bits;
557 	}
558 
559 	return (plen);
560 }
561 
562 /*
563  * Convert a prefix length to the mask for that prefix.
564  * Returns the argument bitmask.
565  */
566 in6_addr_t *
567 ip_plen_to_mask_v6(uint_t plen, in6_addr_t *bitmask)
568 {
569 	uint32_t *ptr;
570 
571 	if (plen < 0 || plen > IPV6_ABITS)
572 		return (NULL);
573 	*bitmask = ipv6_all_zeros;
574 
575 	ptr = (uint32_t *)bitmask;
576 	while (plen > 32) {
577 		*ptr++ = 0xffffffffU;
578 		plen -= 32;
579 	}
580 	*ptr = htonl(0xffffffffU << (32 - plen));
581 	return (bitmask);
582 }
583 
584 /*
585  * Add a fully initialized IRE to an appropriate
586  * table based on ire_type.
587  *
588  * The forward table contains IRE_PREFIX/IRE_HOST/IRE_HOST_REDIRECT
589  * IRE_IF_RESOLVER/IRE_IF_NORESOLVER and IRE_DEFAULT.
590  *
591  * The cache table contains IRE_BROADCAST/IRE_LOCAL/IRE_LOOPBACK
592  * and IRE_CACHE.
593  *
594  * NOTE : This function is called as writer though not required
595  * by this function.
596  */
597 int
598 ire_add_v6(ire_t **ire_p, queue_t *q, mblk_t *mp, ipsq_func_t func)
599 {
600 	ire_t	*ire1;
601 	int	mask_table_index;
602 	irb_t	*irb_ptr;
603 	ire_t	**irep;
604 	int	flags;
605 	ire_t	*pire = NULL;
606 	ill_t	*stq_ill;
607 	boolean_t	ndp_g_lock_held = B_FALSE;
608 	ire_t	*ire = *ire_p;
609 	int	error;
610 
611 	ASSERT(ire->ire_ipversion == IPV6_VERSION);
612 	ASSERT(ire->ire_mp == NULL); /* Calls should go through ire_add */
613 	ASSERT(ire->ire_nce == NULL);
614 
615 	/* Find the appropriate list head. */
616 	switch (ire->ire_type) {
617 	case IRE_HOST:
618 		ire->ire_mask_v6 = ipv6_all_ones;
619 		ire->ire_masklen = IPV6_ABITS;
620 		if ((ire->ire_flags & RTF_SETSRC) == 0)
621 			ire->ire_src_addr_v6 = ipv6_all_zeros;
622 		break;
623 	case IRE_HOST_REDIRECT:
624 		ire->ire_mask_v6 = ipv6_all_ones;
625 		ire->ire_masklen = IPV6_ABITS;
626 		ire->ire_src_addr_v6 = ipv6_all_zeros;
627 		break;
628 	case IRE_CACHE:
629 	case IRE_LOCAL:
630 	case IRE_LOOPBACK:
631 		ire->ire_mask_v6 = ipv6_all_ones;
632 		ire->ire_masklen = IPV6_ABITS;
633 		break;
634 	case IRE_PREFIX:
635 		if ((ire->ire_flags & RTF_SETSRC) == 0)
636 			ire->ire_src_addr_v6 = ipv6_all_zeros;
637 		break;
638 	case IRE_DEFAULT:
639 		if ((ire->ire_flags & RTF_SETSRC) == 0)
640 			ire->ire_src_addr_v6 = ipv6_all_zeros;
641 		break;
642 	case IRE_IF_RESOLVER:
643 	case IRE_IF_NORESOLVER:
644 		break;
645 	default:
646 		printf("ire_add_v6: ire %p has unrecognized IRE type (%d)\n",
647 		    (void *)ire, ire->ire_type);
648 		ire_delete(ire);
649 		*ire_p = NULL;
650 		return (EINVAL);
651 	}
652 
653 	/* Make sure the address is properly masked. */
654 	V6_MASK_COPY(ire->ire_addr_v6, ire->ire_mask_v6, ire->ire_addr_v6);
655 
656 	if ((ire->ire_type & IRE_CACHETABLE) == 0) {
657 		/* IRE goes into Forward Table */
658 		mask_table_index = ip_mask_to_plen_v6(&ire->ire_mask_v6);
659 		if ((ip_forwarding_table_v6[mask_table_index]) == NULL) {
660 			irb_t *ptr;
661 			int i;
662 
663 			ptr = (irb_t *)mi_zalloc((ip6_ftable_hash_size *
664 			    sizeof (irb_t)));
665 			if (ptr == NULL) {
666 				ire_delete(ire);
667 				*ire_p = NULL;
668 				return (ENOMEM);
669 			}
670 			for (i = 0; i < ip6_ftable_hash_size; i++) {
671 				rw_init(&ptr[i].irb_lock, NULL,
672 				    RW_DEFAULT, NULL);
673 			}
674 			mutex_enter(&ire_ft_init_lock);
675 			if (ip_forwarding_table_v6[mask_table_index] == NULL) {
676 				ip_forwarding_table_v6[mask_table_index] = ptr;
677 				mutex_exit(&ire_ft_init_lock);
678 			} else {
679 				/*
680 				 * Some other thread won the race in
681 				 * initializing the forwarding table at the
682 				 * same index.
683 				 */
684 				mutex_exit(&ire_ft_init_lock);
685 				for (i = 0; i < ip6_ftable_hash_size; i++) {
686 					rw_destroy(&ptr[i].irb_lock);
687 				}
688 				mi_free(ptr);
689 			}
690 		}
691 		irb_ptr = &(ip_forwarding_table_v6[mask_table_index][
692 		    IRE_ADDR_MASK_HASH_V6(ire->ire_addr_v6, ire->ire_mask_v6,
693 		    ip6_ftable_hash_size)]);
694 	} else {
695 		irb_ptr = &(ip_cache_table_v6[IRE_ADDR_HASH_V6(
696 		    ire->ire_addr_v6, ip6_cache_table_size)]);
697 	}
698 	/*
699 	 * For xresolv interfaces (v6 interfaces with an external
700 	 * address resolver), ip_newroute_v6/ip_newroute_ipif_v6
701 	 * are unable to prevent the deletion of the interface route
702 	 * while adding an IRE_CACHE for an on-link destination
703 	 * in the IRE_IF_RESOLVER case, since the ire has to go to
704 	 * the external resolver and return. We can't do a REFHOLD on the
705 	 * associated interface ire for fear of the message being freed
706 	 * if the external resolver can't resolve the address.
707 	 * Here we look up the interface ire in the forwarding table
708 	 * and make sure that the interface route has not been deleted.
709 	 */
710 	if (ire->ire_type == IRE_CACHE &&
711 	    IN6_IS_ADDR_UNSPECIFIED(&ire->ire_gateway_addr_v6) &&
712 	    (((ill_t *)ire->ire_stq->q_ptr)->ill_net_type == IRE_IF_RESOLVER) &&
713 	    (((ill_t *)ire->ire_stq->q_ptr)->ill_flags & ILLF_XRESOLV)) {
714 
715 		pire = ire_ihandle_lookup_onlink_v6(ire);
716 		if (pire == NULL) {
717 			ire_delete(ire);
718 			*ire_p = NULL;
719 			return (EINVAL);
720 		}
721 		/* Prevent pire from getting deleted */
722 		IRB_REFHOLD(pire->ire_bucket);
723 		/* Has it been removed already? */
724 		if (pire->ire_marks & IRE_MARK_CONDEMNED) {
725 			IRB_REFRELE(pire->ire_bucket);
726 			ire_refrele(pire);
727 			ire_delete(ire);
728 			*ire_p = NULL;
729 			return (EINVAL);
730 		}
731 	}
732 
733 	flags = (MATCH_IRE_MASK | MATCH_IRE_TYPE | MATCH_IRE_GW);
734 	/*
735 	 * For IRE_CACHES, MATCH_IRE_IPIF is not enough to check
736 	 * for duplicates because :
737 	 *
738 	 * 1) ire_ipif->ipif_ill and ire_stq->q_ptr could be
739 	 *    pointing at different ills. A real duplicate is
740 	 *    a match on both ire_ipif and ire_stq.
741 	 *
742 	 * 2) We could have multiple packets trying to create
743 	 *    an IRE_CACHE for the same ill.
744 	 *
745 	 * Moreover, IPIF_NOFAILOVER and IPV6_BOUND_PIF endpoints wants
746 	 * to go out on a particular ill. Rather than looking at the
747 	 * packet, we depend on the above for MATCH_IRE_ILL here.
748 	 *
749 	 * Unlike IPv4, MATCH_IRE_IPIF is needed here as we could have
750 	 * multiple IRE_CACHES for an ill for the same destination
751 	 * with various scoped addresses i.e represented by ipifs.
752 	 *
753 	 * MATCH_IRE_ILL is done implicitly below for IRE_CACHES.
754 	 */
755 	if (ire->ire_ipif != NULL)
756 		flags |= MATCH_IRE_IPIF;
757 	/*
758 	 * If we are creating hidden ires, make sure we search on
759 	 * this ill (MATCH_IRE_ILL) and a hidden ire, while we are
760 	 * searching for duplicates below. Otherwise we could
761 	 * potentially find an IRE on some other interface
762 	 * and it may not be a IRE marked with IRE_MARK_HIDDEN. We
763 	 * shouldn't do this as this will lead to an infinite loop as
764 	 * eventually we need an hidden ire for this packet to go
765 	 * out. MATCH_IRE_ILL is already marked above.
766 	 */
767 	if (ire->ire_marks & IRE_MARK_HIDDEN) {
768 		ASSERT(ire->ire_type == IRE_CACHE);
769 		flags |= MATCH_IRE_MARK_HIDDEN;
770 	}
771 
772 	/*
773 	 * Start the atomic add of the ire. Grab the ill locks,
774 	 * ill_g_usesrc_lock and the bucket lock. Check for condemned.
775 	 * To avoid lock order problems, get the ndp_g_lock now itself.
776 	 */
777 	if (ire->ire_type == IRE_CACHE) {
778 		mutex_enter(&ndp_g_lock);
779 		ndp_g_lock_held = B_TRUE;
780 	}
781 
782 	/*
783 	 * If ipif or ill is changing ire_atomic_start() may queue the
784 	 * request and return EINPROGRESS.
785 	 */
786 
787 	error = ire_atomic_start(irb_ptr, ire, q, mp, func);
788 	if (error != 0) {
789 		if (ndp_g_lock_held)
790 			mutex_exit(&ndp_g_lock);
791 		/*
792 		 * We don't know whether it is a valid ipif or not.
793 		 * So, set it to NULL. This assumes that the ire has not added
794 		 * a reference to the ipif.
795 		 */
796 		ire->ire_ipif = NULL;
797 		ire_delete(ire);
798 		if (pire != NULL) {
799 			IRB_REFRELE(pire->ire_bucket);
800 			ire_refrele(pire);
801 		}
802 		*ire_p = NULL;
803 		return (error);
804 	}
805 	/*
806 	 * To avoid creating ires having stale values for the ire_max_frag
807 	 * we get the latest value atomically here. For more details
808 	 * see the block comment in ip_sioctl_mtu and in DL_NOTE_SDU_CHANGE
809 	 * in ip_rput_dlpi_writer
810 	 */
811 	if (ire->ire_max_fragp == NULL) {
812 		if (IN6_IS_ADDR_MULTICAST(&ire->ire_addr_v6))
813 			ire->ire_max_frag = ire->ire_ipif->ipif_mtu;
814 		else
815 			ire->ire_max_frag = pire->ire_max_frag;
816 	} else {
817 		uint_t  max_frag;
818 
819 		max_frag = *ire->ire_max_fragp;
820 		ire->ire_max_fragp = NULL;
821 		ire->ire_max_frag = max_frag;
822 	}
823 
824 	/*
825 	 * Atomically check for duplicate and insert in the table.
826 	 */
827 	for (ire1 = irb_ptr->irb_ire; ire1 != NULL; ire1 = ire1->ire_next) {
828 		if (ire1->ire_marks & IRE_MARK_CONDEMNED)
829 			continue;
830 
831 		if (ire->ire_type == IRE_CACHE) {
832 			/*
833 			 * We do MATCH_IRE_ILL implicitly here for IRE_CACHES.
834 			 * As ire_ipif and ire_stq could point to two
835 			 * different ills, we can't pass just ire_ipif to
836 			 * ire_match_args and get a match on both ills.
837 			 * This is just needed for duplicate checks here and
838 			 * so we don't add an extra argument to
839 			 * ire_match_args for this. Do it locally.
840 			 *
841 			 * NOTE : Currently there is no part of the code
842 			 * that asks for both MATH_IRE_IPIF and MATCH_IRE_ILL
843 			 * match for IRE_CACHEs. Thus we don't want to
844 			 * extend the arguments to ire_match_args_v6.
845 			 */
846 			if (ire1->ire_stq != ire->ire_stq)
847 				continue;
848 			/*
849 			 * Multiroute IRE_CACHEs for a given destination can
850 			 * have the same ire_ipif, typically if their source
851 			 * address is forced using RTF_SETSRC, and the same
852 			 * send-to queue. We differentiate them using the parent
853 			 * handle.
854 			 */
855 			if ((ire1->ire_flags & RTF_MULTIRT) &&
856 			    (ire->ire_flags & RTF_MULTIRT) &&
857 			    (ire1->ire_phandle != ire->ire_phandle))
858 				continue;
859 		}
860 		if (ire1->ire_zoneid != ire->ire_zoneid)
861 			continue;
862 		if (ire_match_args_v6(ire1, &ire->ire_addr_v6,
863 		    &ire->ire_mask_v6, &ire->ire_gateway_addr_v6,
864 		    ire->ire_type, ire->ire_ipif, ire->ire_zoneid, 0, NULL,
865 		    flags)) {
866 			/*
867 			 * Return the old ire after doing a REFHOLD.
868 			 * As most of the callers continue to use the IRE
869 			 * after adding, we return a held ire. This will
870 			 * avoid a lookup in the caller again. If the callers
871 			 * don't want to use it, they need to do a REFRELE.
872 			 */
873 			ip1dbg(("found dup ire existing %p new %p",
874 			    (void *)ire1, (void *)ire));
875 			IRE_REFHOLD(ire1);
876 			if (ndp_g_lock_held)
877 				mutex_exit(&ndp_g_lock);
878 			ire_atomic_end(irb_ptr, ire);
879 			ire_delete(ire);
880 			if (pire != NULL) {
881 				/*
882 				 * Assert that it is
883 				 * not yet removed from the list.
884 				 */
885 				ASSERT(pire->ire_ptpn != NULL);
886 				IRB_REFRELE(pire->ire_bucket);
887 				ire_refrele(pire);
888 			}
889 			*ire_p = ire1;
890 			return (0);
891 		}
892 	}
893 	if (ire->ire_type == IRE_CACHE) {
894 		in6_addr_t gw_addr_v6;
895 		ill_t	*ill = ire_to_ill(ire);
896 		char	buf[INET6_ADDRSTRLEN];
897 		nce_t	*nce;
898 
899 		/*
900 		 * All IRE_CACHE types must have a nce.  If this is
901 		 * not the case the entry will not be added. We need
902 		 * to make sure that if somebody deletes the nce
903 		 * after we looked up, they will find this ire and
904 		 * delete the ire. To delete this ire one needs the
905 		 * bucket lock which we are still holding here. So,
906 		 * even if the nce gets deleted after we looked up,
907 		 * this ire  will get deleted.
908 		 *
909 		 * NOTE : Don't need the ire_lock for accessing
910 		 * ire_gateway_addr_v6 as it is appearing first
911 		 * time on the list and rts_setgwr_v6 could not
912 		 * be changing this.
913 		 */
914 		gw_addr_v6 = ire->ire_gateway_addr_v6;
915 		if (IN6_IS_ADDR_UNSPECIFIED(&gw_addr_v6)) {
916 			nce = ndp_lookup(ill, &ire->ire_addr_v6, B_TRUE);
917 		} else {
918 			nce = ndp_lookup(ill, &gw_addr_v6, B_TRUE);
919 		}
920 		if (nce == NULL)
921 			goto failed;
922 
923 		/* Pair of refhold, refrele just to get the tracing right */
924 		NCE_REFHOLD_NOTR(nce);
925 		NCE_REFRELE(nce);
926 		/*
927 		 * Atomically make sure that new IREs don't point
928 		 * to an NCE that is logically deleted (CONDEMNED).
929 		 * ndp_delete() first marks the NCE CONDEMNED.
930 		 * This ensures that the nce_refcnt won't increase
931 		 * due to new nce_lookups or due to addition of new IREs
932 		 * pointing to this NCE. Then ndp_delete() cleans up
933 		 * existing references. If we don't do it atomically here,
934 		 * ndp_delete() -> nce_ire_delete() will not be able to
935 		 * clean up the IRE list completely, and the nce_refcnt
936 		 * won't go down to zero.
937 		 */
938 		mutex_enter(&nce->nce_lock);
939 		if (ill->ill_flags & ILLF_XRESOLV) {
940 			/*
941 			 * If we used an external resolver, we may not
942 			 * have gone through neighbor discovery to get here.
943 			 * Must update the nce_state before the next check.
944 			 */
945 			if (nce->nce_state == ND_INCOMPLETE)
946 				nce->nce_state = ND_REACHABLE;
947 		}
948 		if (nce->nce_state == ND_INCOMPLETE ||
949 		    (nce->nce_flags & NCE_F_CONDEMNED) ||
950 		    (nce->nce_state == ND_UNREACHABLE)) {
951 failed:
952 			if (ndp_g_lock_held)
953 				mutex_exit(&ndp_g_lock);
954 			if (nce != NULL)
955 				mutex_exit(&nce->nce_lock);
956 			ire_atomic_end(irb_ptr, ire);
957 			ip1dbg(("ire_add_v6: No nce for dst %s \n",
958 			    inet_ntop(AF_INET6, &ire->ire_addr_v6,
959 			    buf, sizeof (buf))));
960 			ire_delete(ire);
961 			if (pire != NULL) {
962 				/*
963 				 * Assert that it is
964 				 * not yet removed from the list.
965 				 */
966 				ASSERT(pire->ire_ptpn != NULL);
967 				IRB_REFRELE(pire->ire_bucket);
968 				ire_refrele(pire);
969 			}
970 			if (nce != NULL)
971 				NCE_REFRELE_NOTR(nce);
972 			*ire_p = NULL;
973 			return (EINVAL);
974 		} else {
975 			ire->ire_nce = nce;
976 		}
977 		mutex_exit(&nce->nce_lock);
978 	}
979 	/*
980 	 * Find the first entry that matches ire_addr - provides
981 	 * tail insertion. *irep will be null if no match.
982 	 */
983 	irep = (ire_t **)irb_ptr;
984 	while ((ire1 = *irep) != NULL &&
985 	    !IN6_ARE_ADDR_EQUAL(&ire->ire_addr_v6, &ire1->ire_addr_v6))
986 		irep = &ire1->ire_next;
987 	ASSERT(!(ire->ire_type & IRE_BROADCAST));
988 
989 	if (*irep != NULL) {
990 		/*
991 		 * Find the last ire which matches ire_addr_v6.
992 		 * Needed to do tail insertion among entries with the same
993 		 * ire_addr_v6.
994 		 */
995 		while (IN6_ARE_ADDR_EQUAL(&ire->ire_addr_v6,
996 		    &ire1->ire_addr_v6)) {
997 			irep = &ire1->ire_next;
998 			ire1 = *irep;
999 			if (ire1 == NULL)
1000 				break;
1001 		}
1002 	}
1003 
1004 	if (ire->ire_type == IRE_DEFAULT) {
1005 		/*
1006 		 * We keep a count of default gateways which is used when
1007 		 * assigning them as routes.
1008 		 */
1009 		ipv6_ire_default_count++;
1010 		ASSERT(ipv6_ire_default_count != 0); /* Wraparound */
1011 	}
1012 	/* Insert at *irep */
1013 	ire1 = *irep;
1014 	if (ire1 != NULL)
1015 		ire1->ire_ptpn = &ire->ire_next;
1016 	ire->ire_next = ire1;
1017 	/* Link the new one in. */
1018 	ire->ire_ptpn = irep;
1019 	/*
1020 	 * ire_walk routines de-reference ire_next without holding
1021 	 * a lock. Before we point to the new ire, we want to make
1022 	 * sure the store that sets the ire_next of the new ire
1023 	 * reaches global visibility, so that ire_walk routines
1024 	 * don't see a truncated list of ires i.e if the ire_next
1025 	 * of the new ire gets set after we do "*irep = ire" due
1026 	 * to re-ordering, the ire_walk thread will see a NULL
1027 	 * once it accesses the ire_next of the new ire.
1028 	 * membar_producer() makes sure that the following store
1029 	 * happens *after* all of the above stores.
1030 	 */
1031 	membar_producer();
1032 	*irep = ire;
1033 	ire->ire_bucket = irb_ptr;
1034 	/*
1035 	 * We return a bumped up IRE above. Keep it symmetrical
1036 	 * so that the callers will always have to release. This
1037 	 * helps the callers of this function because they continue
1038 	 * to use the IRE after adding and hence they don't have to
1039 	 * lookup again after we return the IRE.
1040 	 *
1041 	 * NOTE : We don't have to use atomics as this is appearing
1042 	 * in the list for the first time and no one else can bump
1043 	 * up the reference count on this yet.
1044 	 */
1045 	IRE_REFHOLD_LOCKED(ire);
1046 	BUMP_IRE_STATS(ire_stats_v6, ire_stats_inserted);
1047 	irb_ptr->irb_ire_cnt++;
1048 	if (ire->ire_marks & IRE_MARK_TEMPORARY)
1049 		irb_ptr->irb_tmp_ire_cnt++;
1050 
1051 	if (ire->ire_ipif != NULL) {
1052 		ire->ire_ipif->ipif_ire_cnt++;
1053 		if (ire->ire_stq != NULL) {
1054 			stq_ill = (ill_t *)ire->ire_stq->q_ptr;
1055 			stq_ill->ill_ire_cnt++;
1056 		}
1057 	} else {
1058 		ASSERT(ire->ire_stq == NULL);
1059 	}
1060 
1061 	if (ndp_g_lock_held)
1062 		mutex_exit(&ndp_g_lock);
1063 	ire_atomic_end(irb_ptr, ire);
1064 
1065 	if (pire != NULL) {
1066 		/* Assert that it is not removed from the list yet */
1067 		ASSERT(pire->ire_ptpn != NULL);
1068 		IRB_REFRELE(pire->ire_bucket);
1069 		ire_refrele(pire);
1070 	}
1071 
1072 	if (ire->ire_type != IRE_CACHE) {
1073 		/*
1074 		 * For ire's with with host mask see if there is an entry
1075 		 * in the cache. If there is one flush the whole cache as
1076 		 * there might be multiple entries due to RTF_MULTIRT (CGTP).
1077 		 * If no entry is found than there is no need to flush the
1078 		 * cache.
1079 		 */
1080 
1081 		if (ip_mask_to_plen_v6(&ire->ire_mask_v6) == IPV6_ABITS) {
1082 			ire_t *lire;
1083 			lire = ire_ctable_lookup_v6(&ire->ire_addr_v6, NULL,
1084 			    IRE_CACHE, NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE);
1085 			if (lire != NULL) {
1086 				ire_refrele(lire);
1087 				ire_flush_cache_v6(ire, IRE_FLUSH_ADD);
1088 			}
1089 		} else {
1090 			ire_flush_cache_v6(ire, IRE_FLUSH_ADD);
1091 		}
1092 	}
1093 
1094 	*ire_p = ire;
1095 	return (0);
1096 }
1097 
1098 /*
1099  * Delete every IRE_HOST_REDIRECT route in the host-route forwarding
1100  * table whose gateway is 'gateway'. Each entry's gateway is copied
1101  * under its ire_lock before being compared. This routine is called
1102  * only when a default gateway is going away.
1103  */
1104 static void
1105 ire_delete_host_redirects_v6(const in6_addr_t *gateway)
1106 {
1107 	irb_t		*host_tbl;
1108 	irb_t		*bucket;
1109 	ire_t		*cur;
1110 	in6_addr_t	cur_gw;
1111 	int		slot;
1112 
1113 	/* Host routes hash into the last of the per-mask tables. */
1114 	host_tbl = ip_forwarding_table_v6[IP6_MASK_TABLE_SIZE - 1];
1115 	if (host_tbl == NULL)
1116 		return;
1117 	for (slot = 0; slot < ip6_ftable_hash_size; slot++) {
1118 		bucket = host_tbl + slot;
1119 		IRB_REFHOLD(bucket);
1120 		for (cur = bucket->irb_ire; cur != NULL; cur = cur->ire_next) {
1121 			if (cur->ire_type == IRE_HOST_REDIRECT) {
1122 				mutex_enter(&cur->ire_lock);
1123 				cur_gw = cur->ire_gateway_addr_v6;
1124 				mutex_exit(&cur->ire_lock);
1125 				if (IN6_ARE_ADDR_EQUAL(&cur_gw, gateway))
1126 					ire_delete(cur);
1127 			}
1128 		}
1129 		IRB_REFRELE(bucket);
1130 	}
1131 }
1132 
1133 /*
1134  * Delete all the cache entries with this 'addr'. This is the IPv6 counterpart
1135  * of ip_ire_clookup_and_delete, except that it returns nothing (for IPv6 an
1136  * ire_walk_v6 is done regardless of the presence of a cache entry) and takes
1137  * no ipif_t argument, since it is only called by ip_arp_news and the match
1138  * is always only on the address. Entries already marked IRE_MARK_CONDEMNED
1139  * are skipped; ire_marks is a bit mask, so the mark is tested with '&' and
1140  * is still recognized when other marks are set on the same entry.
1141  */
1142 void
1143 ip_ire_clookup_and_delete_v6(const in6_addr_t *addr)
1144 {
1145 	irb_t		*irb;
1146 	ire_t		*cire;
1147 	boolean_t	found = B_FALSE;
1148 
1149 	irb = &ip_cache_table_v6[IRE_ADDR_HASH_V6(*addr, ip6_cache_table_size)];
1150 	IRB_REFHOLD(irb);
1151 	for (cire = irb->irb_ire; cire != NULL; cire = cire->ire_next) {
1152 		if (cire->ire_marks & IRE_MARK_CONDEMNED)
1153 			continue;
1154 		if (IN6_ARE_ADDR_EQUAL(&cire->ire_addr_v6, addr)) {
1155 			/*
1156 			 * In the run of adjacent entries for 'addr'.
1157 			 */
1158 			found = B_TRUE;
1159 			if (cire->ire_type == IRE_CACHE) {
1160 				if (cire->ire_nce != NULL)
1161 					ndp_delete(cire->ire_nce);
1162 				ire_delete_v6(cire);
1163 			}
1164 		/* First mismatch after the run: nothing left to look at */
1165 		} else if (found)
1166 			break;
1167 	}
1168 	IRB_REFRELE(irb);
1169 }
1170 
1171 /*
1172  * IPv6-specific part of deleting an IRE.
1173  * All calls should use ire_delete() rather than call this directly.
1174  * Sometimes called as writer though not required by this function.
1175  *
1176  * NOTE : This function is called only if the ire was added
1177  * in the list; ire_refcnt is asserted to be at least 1.
1178  */
1179 void
1180 ire_delete_v6(ire_t *ire)
1181 {
1182 	in6_addr_t	dflt_gw;
1183 
1184 	ASSERT(ire->ire_refcnt >= 1);
1185 	ASSERT(ire->ire_ipversion == IPV6_VERSION);
1186 
1187 	if (ire->ire_type != IRE_CACHE)
1188 		ire_flush_cache_v6(ire, IRE_FLUSH_DELETE);
1189 	if (ire->ire_type != IRE_DEFAULT)
1190 		return;
1191 
1192 	/*
1193 	 * A default gateway is going away: copy its address under
1194 	 * ire_lock and delete all the host redirects pointing at it.
1195 	 */
1196 	mutex_enter(&ire->ire_lock);
1197 	dflt_gw = ire->ire_gateway_addr_v6;
1198 	mutex_exit(&ire->ire_lock);
1199 	ire_delete_host_redirects_v6(&dflt_gw);
1200 }
1201 
1202 /*
1203  * ire_walk callback: delete 'ire' when it is an IRE_CACHE or an
1204  * IRE_HOST_REDIRECT entry; other types are left alone. 'arg' is unused.
1205  */
1206 /*ARGSUSED1*/
1207 void
1208 ire_delete_cache_v6(ire_t *ire, char *arg)
1209 {
1210 	char	dst_str[INET6_ADDRSTRLEN];
1211 	char	gw_str[INET6_ADDRSTRLEN];
1212 
1213 	if ((ire->ire_type & (IRE_CACHE | IRE_HOST_REDIRECT)) == 0)
1214 		return;
1215 
1216 	/* Trace the destination, type and gateway of the doomed entry. */
1217 	ip1dbg(("ire_delete_cache_v6: deleted %s type %d through %s\n",
1218 	    inet_ntop(AF_INET6, &ire->ire_addr_v6, dst_str, sizeof (dst_str)),
1219 	    ire->ire_type,
1220 	    inet_ntop(AF_INET6, &ire->ire_gateway_addr_v6,
1221 	    gw_str, sizeof (gw_str))));
1222 	ire_delete(ire);
1223 }
1224 
1225 /*
1226  * ire_walk callback: delete 'ire' if it is an IRE_CACHE or
1227  * IRE_HOST_REDIRECT entry whose gateway equals the in6_addr_t at 'addr'.
1228  */
1229 void
1230 ire_delete_cache_gw_v6(ire_t *ire, char *addr)
1231 {
1232 	in6_addr_t	*wanted_gw = (in6_addr_t *)addr;
1233 	in6_addr_t	cur_gw;
1234 	char		src_str[INET6_ADDRSTRLEN];
1235 	char		gw_str[INET6_ADDRSTRLEN];
1236 
1237 	if ((ire->ire_type & (IRE_CACHE | IRE_HOST_REDIRECT)) == 0)
1238 		return;
1239 
1240 	/* Compare against a copy of the gateway taken under ire_lock. */
1241 	mutex_enter(&ire->ire_lock);
1242 	cur_gw = ire->ire_gateway_addr_v6;
1243 	mutex_exit(&ire->ire_lock);
1244 	if (!IN6_ARE_ADDR_EQUAL(&cur_gw, wanted_gw))
1245 		return;
1246 
1247 	ip1dbg(("ire_delete_cache_gw_v6: deleted %s type %d to %s\n",
1248 	    inet_ntop(AF_INET6, &ire->ire_src_addr_v6,
1249 	    src_str, sizeof (src_str)),
1250 	    ire->ire_type,
1251 	    inet_ntop(AF_INET6, &cur_gw, gw_str, sizeof (gw_str))));
1252 	ire_delete(ire);
1253 }
1254 
1255 /*
1256  * Remove all IRE_CACHE entries that match the ire specified.
1257  * (Sometimes called as writer though not required by this function.)
1258  *
1259  * 'ire' is expected to come from the forwarding table; the matching
1260  * entries are removed from the cache table, whose every bucket is walked.
1261  * Nothing is done if 'ire' is itself an IRE_CACHE, or if it is an
1262  * IRE_DEFAULT that has just been added.
1263  *
1264  * The flag argument indicates if the flush request is due to addition
1265  * of new route (IRE_FLUSH_ADD) or deletion of old route
1266  * (IRE_FLUSH_DELETE); any value other than IRE_FLUSH_ADD takes the
1267  * deletion path.
1268  *
1269  * When flushing due to the deletion of an old route, it
1270  * just checks the cache handles (ire_phandle and ire_ihandle) and
1271  * deletes the ones that match; a zero handle never matches.
1272  *
1273  * When flushing due to the creation of a new route, it checks
1274  * if a cache entry's address matches the one in the IRE and
1275  * that the cache entry's parent mask is no more specific than the
1276  * one in IRE. The destination of such a cache entry could be the
1277  * gateway for other cache entries, so we need to flush those as
1278  * well by looking for gateway addresses matching the IRE's address.
1279  */
1280 void
1281 ire_flush_cache_v6(ire_t *ire, int flag)
1282 {
1283 	int i;
1284 	ire_t *cire;
1285 	irb_t *irb;
1286 
1287 	if (ire->ire_type & IRE_CACHE)
1288 	    return;
1289 
1290 	/*
1291 	 * If a default is just created, there is no point
1292 	 * in going through the cache, as there will not be any
1293 	 * cached ires.
1294 	 */
1295 	if (ire->ire_type == IRE_DEFAULT && flag == IRE_FLUSH_ADD)
1296 		return;
1297 	if (flag == IRE_FLUSH_ADD) {
1298 		/*
1299 		 * This selective flush is due to the addition of a
1300 		 * new IRE: walk every cache bucket (empty ones are
1301 		 * skipped without taking a hold) under IRB_REFHOLD.
1302 		 */
1303 		for (i = 0; i < ip6_cache_table_size; i++) {
1304 			irb = &ip_cache_table_v6[i];
1305 			if ((cire = irb->irb_ire) == NULL)
1306 				continue;
1307 			IRB_REFHOLD(irb);
1308 			for (cire = irb->irb_ire; cire != NULL;
1309 			    cire = cire->ire_next) {
1310 				if (cire->ire_type != IRE_CACHE)
1311 					continue;
1312 				/*
1313 				 * If 'cire' belongs to the same subnet
1314 				 * as the new ire being added, and 'cire'
1315 				 * is derived from a prefix that is no more
1316 				 * specific than the new ire being added,
1317 				 * we need to flush 'cire'; for instance,
1318 				 * when a new interface comes up.
1319 				 */
1320 				if ((V6_MASK_EQ_2(cire->ire_addr_v6,
1321 				    ire->ire_mask_v6, ire->ire_addr_v6) &&
1322 				    (ip_mask_to_plen_v6(&cire->ire_cmask_v6) <=
1323 				    ire->ire_masklen))) {
1324 					ire_delete(cire);
1325 					continue;
1326 				}
1327 				/*
1328 				 * This is the case when the ire_gateway_addr
1329 				 * of 'cire' belongs to the same subnet as
1330 				 * the new ire being added.
1331 				 * Flushing such ires is sometimes required to
1332 				 * avoid misrouting: say we have a machine with
1333 				 * two interfaces (I1 and I2), a default router
1334 				 * R on the I1 subnet, and a host route to an
1335 				 * off-link destination D with a gateway G on
1336 				 * the I2 subnet.
1337 				 * Under normal operation, we will have an
1338 				 * on-link cache entry for G and an off-link
1339 				 * cache entry for D with G as ire_gateway_addr,
1340 				 * traffic to D will reach its destination
1341 				 * through gateway G.
1342 				 * If the administrator does 'ifconfig I2 down',
1343 				 * the cache entries for D and G will be
1344 				 * flushed. However, G will now be resolved as
1345 				 * an off-link destination using R (the default
1346 				 * router) as gateway. Then D will also be
1347 				 * resolved as an off-link destination using G
1348 				 * as gateway - this behavior is due to
1349 				 * compatibility reasons, see comment in
1350 				 * ire_ihandle_lookup_offlink(). Traffic to D
1351 				 * will go to the router R and probably won't
1352 				 * reach the destination.
1353 				 * The administrator then does 'ifconfig I2 up'.
1354 				 * G is on the I2 subnet, so its cache entry is
1355 				 * flushed here; D's entry must go too or traffic
1356 				 * stays misrouted until the IRE times out.
1357 				 * NOTE(review): ire_gateway_addr_v6 is read here
1358 				 * without ire_lock - confirm that is intended.
1359 				 */
1360 				if (V6_MASK_EQ_2(cire->ire_gateway_addr_v6,
1361 				    ire->ire_mask_v6, ire->ire_addr_v6)) {
1362 					ire_delete(cire);
1363 					continue;
1364 				}
1365 			}
1366 			IRB_REFRELE(irb);
1367 		}
1368 	} else {
1369 		/*
1370 		 * This IRE is being deleted/changed: delete each
1371 		 * IRE_CACHE entry that shares a non-zero ire_phandle
1372 		 * or a non-zero ire_ihandle with it.
1373 		 */
1374 		for (i = 0; i < ip6_cache_table_size; i++) {
1375 			irb = &ip_cache_table_v6[i];
1376 			if ((cire = irb->irb_ire) == NULL)
1377 				continue;
1378 			IRB_REFHOLD(irb);
1379 			for (cire = irb->irb_ire; cire != NULL;
1380 			    cire = cire->ire_next) {
1381 				if (cire->ire_type != IRE_CACHE)
1382 					continue;
1383 				if ((cire->ire_phandle == 0 ||
1384 				    cire->ire_phandle != ire->ire_phandle) &&
1385 				    (cire->ire_ihandle == 0 ||
1386 				    cire->ire_ihandle != ire->ire_ihandle))
1387 					continue;
1388 				ire_delete(cire);
1389 			}
1390 			IRB_REFRELE(irb);
1391 		}
1392 	}
1393 }
1394 
1395 /*
1396  * Returns B_TRUE if 'ire' matches 'addr' under 'mask' plus every optional
1397  * criterion selected by a MATCH_IRE_* bit in match_flags, else B_FALSE.
1398  * Note: for match types that match using "ipif" passed in, ipif
1399  * must be checked for non-NULL before calling this routine.
1400  */
1401 static boolean_t
1402 ire_match_args_v6(ire_t *ire, const in6_addr_t *addr, const in6_addr_t *mask,
1403     const in6_addr_t *gateway, int type, const ipif_t *ipif, zoneid_t zoneid,
1404     uint32_t ihandle, const ts_label_t *tsl, int match_flags)
1405 {
1406 	in6_addr_t masked_addr;
1407 	in6_addr_t gw_addr_v6;	/* set and read only under MATCH_IRE_GW */
1408 	ill_t *ire_ill = NULL, *dst_ill;
1409 	ill_t *ipif_ill = NULL;
1410 	ill_group_t *ire_ill_group = NULL;
1411 	ill_group_t *ipif_ill_group = NULL;
1412 	ipif_t	*src_ipif;
1413 
1414 	ASSERT(ire->ire_ipversion == IPV6_VERSION);
1415 	ASSERT(addr != NULL);
1416 	ASSERT(mask != NULL);
1417 	ASSERT((!(match_flags & MATCH_IRE_GW)) || gateway != NULL);
1418 	ASSERT((!(match_flags & (MATCH_IRE_ILL|MATCH_IRE_ILL_GROUP))) ||
1419 	    (ipif != NULL && ipif->ipif_isv6));
1420 	ASSERT(!(match_flags & MATCH_IRE_WQ));
1421 
1422 	/*
1423 	 * HIDDEN cache entries have to be looked up specifically with
1424 	 * MATCH_IRE_MARK_HIDDEN. MATCH_IRE_MARK_HIDDEN is usually set
1425 	 * when the interface is FAILED or INACTIVE. In that case,
1426 	 * any IRE_CACHES that exists should be marked with
1427 	 * IRE_MARK_HIDDEN. So, we don't really need to match below
1428 	 * for IRE_MARK_HIDDEN. But we do so for consistency.
1429 	 */
1430 	if (!(match_flags & MATCH_IRE_MARK_HIDDEN) &&
1431 	    (ire->ire_marks & IRE_MARK_HIDDEN))
1432 		return (B_FALSE);
1433 
1434 	if (zoneid != ALL_ZONES && zoneid != ire->ire_zoneid &&
1435 	    ire->ire_zoneid != ALL_ZONES) {
1436 		/*
1437 		 * If MATCH_IRE_ZONEONLY has been set and the supplied zoneid is
1438 		 * valid and does not match that of ire_zoneid, a failure to
1439 		 * match is reported at this point. Otherwise, since some IREs
1440 		 * that are available in the global zone can be used in local
1441 		 * zones, additional checks need to be performed:
1442 		 *
1443 		 *	IRE_CACHE and IRE_LOOPBACK entries should
1444 		 *	never be matched in this situation.
1445 		 *
1446 		 *	IRE entries that have an interface associated with them
1447 		 *	should in general not match unless they are an IRE_LOCAL
1448 		 *	or in the case when MATCH_IRE_DEFAULT has been set in
1449 		 *	the caller.  In the case of the former, checking of the
1450 		 *	other fields supplied should take place.
1451 		 *
1452 		 *	In the case where MATCH_IRE_DEFAULT has been set,
1453 		 *	all of the ipif's associated with the IRE's ill are
1454 		 *	checked to see if there is a matching zoneid.  If any
1455 		 *	one ipif has a matching zoneid, this IRE is a
1456 		 *	potential candidate so checking of the other fields
1457 		 *	takes place.
1458 		 *
1459 		 *	In the case where the IRE_INTERFACE has a usable source
1460 		 *	address (indicated by ill_usesrc_ifindex) in the
1461 		 *	correct zone then it's permitted to return this IRE
1462 		 */
1463 		if (match_flags & MATCH_IRE_ZONEONLY)
1464 			return (B_FALSE);
1465 		if (ire->ire_type & (IRE_CACHE | IRE_LOOPBACK))
1466 			return (B_FALSE);
1467 		/*
1468 		 * Note, IRE_INTERFACE can have the stq as NULL. For
1469 		 * example, if the default multicast route is tied to
1470 		 * the loopback address.
1471 		 */
1472 		if ((ire->ire_type & IRE_INTERFACE) &&
1473 		    (ire->ire_stq != NULL)) {
1474 			dst_ill = (ill_t *)ire->ire_stq->q_ptr;
1475 			/*
1476 			 * If there is a usable source address in the
1477 			 * zone, then it's ok to return an
1478 			 * IRE_INTERFACE
1479 			 */
1480 			if ((dst_ill->ill_usesrc_ifindex != 0) &&
1481 			    (src_ipif = ipif_select_source_v6(dst_ill, addr,
1482 			    B_FALSE, IPV6_PREFER_SRC_DEFAULT, zoneid))
1483 			    != NULL) {
1484 				ip3dbg(("ire_match_args: src_ipif %p"
1485 				    " dst_ill %p", (void *)src_ipif,
1486 				    (void *)dst_ill));
1487 				ipif_refrele(src_ipif);
1488 			} else {
1489 				ip3dbg(("ire_match_args: src_ipif NULL"
1490 				    " dst_ill %p\n", (void *)dst_ill));
1491 				return (B_FALSE);
1492 			}
1493 		}
1494 		if (ire->ire_ipif != NULL && ire->ire_type != IRE_LOCAL &&
1495 		    !(ire->ire_type & IRE_INTERFACE)) {
1496 			ipif_t	*tipif;
1497 
1498 			if ((match_flags & MATCH_IRE_DEFAULT) == 0)
1499 				return (B_FALSE);
1500 			mutex_enter(&ire->ire_ipif->ipif_ill->ill_lock);
1501 			for (tipif = ire->ire_ipif->ipif_ill->ill_ipif;
1502 			    tipif != NULL; tipif = tipif->ipif_next) {
1503 				if (IPIF_CAN_LOOKUP(tipif) &&
1504 				    (tipif->ipif_flags & IPIF_UP) &&
1505 				    (tipif->ipif_zoneid == zoneid ||
1506 				    tipif->ipif_zoneid == ALL_ZONES))
1507 					break;
1508 			}
1509 			mutex_exit(&ire->ire_ipif->ipif_ill->ill_lock);
1510 			if (tipif == NULL)	/* no up ipif usable from zoneid */
1511 				return (B_FALSE);
1512 		}
1513 	}
1514 
1515 	if (match_flags & MATCH_IRE_GW) {
1516 		mutex_enter(&ire->ire_lock);
1517 		gw_addr_v6 = ire->ire_gateway_addr_v6;
1518 		mutex_exit(&ire->ire_lock);
1519 	}
1520 	/*
1521 	 * For IRE_CACHES, MATCH_IRE_ILL/ILL_GROUP really means that
1522 	 * somebody wants to send out on a particular interface which
1523 	 * is given by ire_stq and hence use ire_stq to derive the ill
1524 	 * value. ire_ipif for IRE_CACHES is just the
1525 	 * means of getting a source address i.e ire_src_addr_v6 =
1526 	 * ire->ire_ipif->ipif_src_addr_v6.
1527 	 */
1528 	if (match_flags & (MATCH_IRE_ILL|MATCH_IRE_ILL_GROUP)) {
1529 		ire_ill = ire_to_ill(ire);
1530 		if (ire_ill != NULL)
1531 			ire_ill_group = ire_ill->ill_group;
1532 		ipif_ill = ipif->ipif_ill;
1533 		ipif_ill_group = ipif_ill->ill_group;
1534 	}
1535 
1536 	/* No ire_addr_v6 bits set past the mask */
1537 	ASSERT(V6_MASK_EQ(ire->ire_addr_v6, ire->ire_mask_v6,
1538 	    ire->ire_addr_v6));
1539 	V6_MASK_COPY(*addr, *mask, masked_addr);	/* NOTE(review): masked_addr is never read below */
1540 
1541 	if (V6_MASK_EQ(*addr, *mask, ire->ire_addr_v6) &&
1542 	    ((!(match_flags & MATCH_IRE_GW)) ||
1543 		IN6_ARE_ADDR_EQUAL(&gw_addr_v6, gateway)) &&
1544 	    ((!(match_flags & MATCH_IRE_TYPE)) ||
1545 		(ire->ire_type & type)) &&
1546 	    ((!(match_flags & MATCH_IRE_SRC)) ||
1547 		IN6_ARE_ADDR_EQUAL(&ire->ire_src_addr_v6,
1548 		&ipif->ipif_v6src_addr)) &&
1549 	    ((!(match_flags & MATCH_IRE_IPIF)) ||
1550 		(ire->ire_ipif == ipif)) &&
1551 	    ((!(match_flags & MATCH_IRE_MARK_HIDDEN)) ||
1552 		(ire->ire_type != IRE_CACHE ||
1553 		ire->ire_marks & IRE_MARK_HIDDEN)) &&
1554 	    ((!(match_flags & MATCH_IRE_ILL)) ||
1555 		(ire_ill == ipif_ill)) &&
1556 	    ((!(match_flags & MATCH_IRE_IHANDLE)) ||
1557 		(ire->ire_ihandle == ihandle)) &&
1558 	    ((!(match_flags & MATCH_IRE_ILL_GROUP)) ||
1559 		(ire_ill == ipif_ill) ||
1560 		(ire_ill_group != NULL &&
1561 		ire_ill_group == ipif_ill_group)) &&
1562 	    ((!(match_flags & MATCH_IRE_SECATTR)) ||
1563 		(!is_system_labeled()) ||
1564 		(tsol_ire_match_gwattr(ire, tsl) == 0))) {
1565 		/* Every selected criterion held: this IRE matches */
1566 		return (B_TRUE);
1567 	}
1568 	return (B_FALSE);
1569 }
1570 
1571 /*
1572  * Lookup for a route in all the tables (cache first, then forwarding).
1573  */
1574 ire_t *
1575 ire_route_lookup_v6(const in6_addr_t *addr, const in6_addr_t *mask,
1576     const in6_addr_t *gateway, int type, const ipif_t *ipif, ire_t **pire,
1577     zoneid_t zoneid, const ts_label_t *tsl, int flags)
1578 {
1579 	ire_t		*hit;
1580 	boolean_t	any_type;
1581 
1582 	/*
1583 	 * ire_match_args_v6() will dereference ipif if MATCH_IRE_SRC,
1584 	 * MATCH_IRE_ILL or MATCH_IRE_ILL_GROUP is set, so refuse a NULL one.
1585 	 */
1586 	if (ipif == NULL &&
1587 	    (flags & (MATCH_IRE_SRC | MATCH_IRE_ILL | MATCH_IRE_ILL_GROUP)))
1588 		return (NULL);
1589 
1590 	/*
1591 	 * Without MATCH_IRE_TYPE both tables are searched, cache table
1592 	 * first; with it, only the tables whose types appear in 'type'.
1593 	 * This is not the best way to look up the cache: callers wanting
1594 	 * only that should call ire_cache_lookup directly.
1595 	 */
1596 	any_type = ((flags & MATCH_IRE_TYPE) == 0) ? B_TRUE : B_FALSE;
1597 	if (any_type || (type & IRE_CACHETABLE) != 0) {
1598 		hit = ire_ctable_lookup_v6(addr, gateway, type, ipif, zoneid,
1599 		    tsl, flags);
1600 		if (hit != NULL)
1601 			return (hit);
1602 	}
1603 	if (!any_type && (type & IRE_FORWARDTABLE) == 0)
1604 		return (NULL);
1605 
1606 	return (ire_ftable_lookup_v6(addr, mask, gateway, type, ipif,
1607 	    pire, zoneid, 0, tsl, flags));
1608 }
1609 
1610 /*
1611  * Lookup a route in forwarding table.
1612  * specific lookup is indicated by passing the
1613  * required parameters and indicating the
1614  * match required in flag field.
1615  *
1616  * Looking for default route can be done in three ways
1617  * 1) pass mask as ipv6_all_zeros and set MATCH_IRE_MASK in flags field
1618  *    along with other matches.
1619  * 2) pass type as IRE_DEFAULT and set MATCH_IRE_TYPE in flags
1620  *    field along with other matches.
1621  * 3) if the destination and mask are passed as zeros.
1622  *
1623  * A request to return a default route if no route
1624  * is found, can be specified by setting MATCH_IRE_DEFAULT
1625  * in flags.
1626  *
1627  * It does not support recursion more than one level. It
1628  * will do recursive lookup only when the lookup maps to
1629  * a prefix or default route and MATCH_IRE_RECURSIVE flag is passed.
1630  *
1631  * If the routing table is setup to allow more than one level
1632  * of recursion, the cleaning up cache table will not work resulting
1633  * in invalid routing.
1634  *
1635  * Supports link-local addresses by following the ipif/ill when recursing.
1636  *
1637  * NOTE : When this function returns NULL, pire has already been released.
1638  *	  pire is valid only when this function successfully returns an
1639  *	  ire.
1640  */
1641 ire_t *
1642 ire_ftable_lookup_v6(const in6_addr_t *addr, const in6_addr_t *mask,
1643     const in6_addr_t *gateway, int type, const ipif_t *ipif, ire_t **pire,
1644     zoneid_t zoneid, uint32_t ihandle, const ts_label_t *tsl, int flags)
1645 {
1646 	irb_t *irb_ptr;
1647 	ire_t	*rire;
1648 	ire_t *ire = NULL;
1649 	ire_t	*saved_ire;
1650 	nce_t	*nce;
1651 	int i;
1652 	in6_addr_t gw_addr_v6;
1653 
1654 	ASSERT(addr != NULL);
1655 	ASSERT((!(flags & MATCH_IRE_MASK)) || mask != NULL);
1656 	ASSERT((!(flags & MATCH_IRE_GW)) || gateway != NULL);
1657 	ASSERT(ipif == NULL || ipif->ipif_isv6);
1658 	ASSERT(!(flags & MATCH_IRE_WQ));
1659 
1660 	/*
1661 	 * When we return NULL from this function, we should make
1662 	 * sure that *pire is NULL so that the callers will not
1663 	 * wrongly REFRELE the pire.
1664 	 */
1665 	if (pire != NULL)
1666 		*pire = NULL;
1667 	/*
1668 	 * ire_match_args_v6() will dereference ipif MATCH_IRE_SRC or
1669 	 * MATCH_IRE_ILL is set.
1670 	 */
1671 	if ((flags & (MATCH_IRE_SRC | MATCH_IRE_ILL | MATCH_IRE_ILL_GROUP)) &&
1672 	    (ipif == NULL))
1673 		return (NULL);
1674 
1675 	/*
1676 	 * If the mask is known, the lookup
1677 	 * is simple, if the mask is not known
1678 	 * we need to search.
1679 	 */
1680 	if (flags & MATCH_IRE_MASK) {
1681 		uint_t masklen;
1682 
1683 		masklen = ip_mask_to_plen_v6(mask);
1684 		if (ip_forwarding_table_v6[masklen] == NULL)
1685 			return (NULL);
1686 		irb_ptr = &(ip_forwarding_table_v6[masklen][
1687 		    IRE_ADDR_MASK_HASH_V6(*addr, *mask, ip6_ftable_hash_size)]);
1688 		rw_enter(&irb_ptr->irb_lock, RW_READER);
1689 		for (ire = irb_ptr->irb_ire; ire != NULL;
1690 		    ire = ire->ire_next) {
1691 			if (ire->ire_marks & IRE_MARK_CONDEMNED)
1692 				continue;
1693 			if (ire_match_args_v6(ire, addr, mask, gateway, type,
1694 			    ipif, zoneid, ihandle, tsl, flags))
1695 				goto found_ire;
1696 		}
1697 		rw_exit(&irb_ptr->irb_lock);
1698 	} else {
1699 		/*
1700 		 * In this case we don't know the mask, we need to
1701 		 * search the table assuming different mask sizes.
1702 		 * we start with 128 bit mask, we don't allow default here.
1703 		 */
1704 		for (i = (IP6_MASK_TABLE_SIZE - 1); i > 0; i--) {
1705 			in6_addr_t tmpmask;
1706 
1707 			if ((ip_forwarding_table_v6[i]) == NULL)
1708 				continue;
1709 			(void) ip_plen_to_mask_v6(i, &tmpmask);
1710 			irb_ptr = &ip_forwarding_table_v6[i][
1711 			    IRE_ADDR_MASK_HASH_V6(*addr, tmpmask,
1712 			    ip6_ftable_hash_size)];
1713 			rw_enter(&irb_ptr->irb_lock, RW_READER);
1714 			for (ire = irb_ptr->irb_ire; ire != NULL;
1715 			    ire = ire->ire_next) {
1716 				if (ire->ire_marks & IRE_MARK_CONDEMNED)
1717 					continue;
1718 				if (ire_match_args_v6(ire, addr,
1719 				    &ire->ire_mask_v6, gateway, type, ipif,
1720 				    zoneid, ihandle, tsl, flags))
1721 					goto found_ire;
1722 			}
1723 			rw_exit(&irb_ptr->irb_lock);
1724 		}
1725 	}
1726 
1727 	/*
1728 	 * We come here if no route has yet been found.
1729 	 *
1730 	 * Handle the case where default route is
1731 	 * requested by specifying type as one of the possible
1732 	 * types for that can have a zero mask (IRE_DEFAULT and IRE_INTERFACE).
1733 	 *
1734 	 * If MATCH_IRE_MASK is specified, then the appropriate default route
1735 	 * would have been found above if it exists so it isn't looked up here.
1736 	 * If MATCH_IRE_DEFAULT was also specified, then a default route will be
1737 	 * searched for later.
1738 	 */
1739 	if ((flags & (MATCH_IRE_TYPE | MATCH_IRE_MASK)) == MATCH_IRE_TYPE &&
1740 	    (type & (IRE_DEFAULT | IRE_INTERFACE))) {
1741 		if (ip_forwarding_table_v6[0] != NULL) {
1742 			/* addr & mask is zero for defaults */
1743 			irb_ptr = &ip_forwarding_table_v6[0][
1744 			    IRE_ADDR_HASH_V6(ipv6_all_zeros,
1745 			    ip6_ftable_hash_size)];
1746 			rw_enter(&irb_ptr->irb_lock, RW_READER);
1747 			for (ire = irb_ptr->irb_ire; ire != NULL;
1748 			    ire = ire->ire_next) {
1749 
1750 				if (ire->ire_marks & IRE_MARK_CONDEMNED)
1751 					continue;
1752 
1753 				if (ire_match_args_v6(ire, addr,
1754 				    &ipv6_all_zeros, gateway, type, ipif,
1755 				    zoneid, ihandle, tsl, flags))
1756 					goto found_ire;
1757 			}
1758 			rw_exit(&irb_ptr->irb_lock);
1759 		}
1760 	}
1761 	/*
1762 	 * We come here only if no route is found.
1763 	 * see if the default route can be used which is allowed
1764 	 * only if the default matching criteria is specified.
1765 	 * The ipv6_ire_default_count tracks the number of IRE_DEFAULT
1766 	 * entries. However, the ip_forwarding_table_v6[0] also contains
1767 	 * interface routes thus the count can be zero.
1768 	 */
1769 	saved_ire = NULL;
1770 	if ((flags & (MATCH_IRE_DEFAULT | MATCH_IRE_MASK)) ==
1771 	    MATCH_IRE_DEFAULT) {
1772 		ire_t	*ire_origin;
1773 		uint_t	g_index;
1774 		uint_t	index;
1775 
1776 		if (ip_forwarding_table_v6[0] == NULL)
1777 			return (NULL);
1778 		irb_ptr = &(ip_forwarding_table_v6[0])[0];
1779 
1780 		/*
1781 		 * Keep a tab on the bucket while looking the IRE_DEFAULT
1782 		 * entries. We need to keep track of a particular IRE
1783 		 * (ire_origin) so this ensures that it will not be unlinked
1784 		 * from the hash list during the recursive lookup below.
1785 		 */
1786 		IRB_REFHOLD(irb_ptr);
1787 		ire = irb_ptr->irb_ire;
1788 		if (ire == NULL) {
1789 			IRB_REFRELE(irb_ptr);
1790 			return (NULL);
1791 		}
1792 
1793 		/*
1794 		 * Get the index first, since it can be changed by other
1795 		 * threads. Then get to the right default route skipping
1796 		 * default interface routes if any. As we hold a reference on
1797 		 * the IRE bucket, ipv6_ire_default_count can only increase so
1798 		 * we can't reach the end of the hash list unexpectedly.
1799 		 */
1800 		if (ipv6_ire_default_count != 0) {
1801 			g_index = ipv6_ire_default_index++;
1802 			index = g_index % ipv6_ire_default_count;
1803 			while (index != 0) {
1804 				if (!(ire->ire_type & IRE_INTERFACE))
1805 					index--;
1806 				ire = ire->ire_next;
1807 			}
1808 			ASSERT(ire != NULL);
1809 		} else {
1810 			/*
1811 			 * No default route, so we only have default interface
1812 			 * routes: don't enter the first loop.
1813 			 */
1814 			ire = NULL;
1815 		}
1816 
1817 		/*
1818 		 * Round-robin the default routers list looking for a neighbor
1819 		 * that matches the passed in parameters and is reachable.  If
1820 		 * none found, just return a route from the default router list
1821 		 * if it exists. If we can't find a default route (IRE_DEFAULT),
1822 		 * look for interface default routes.
1823 		 * We start with the ire we found above and we walk the hash
1824 		 * list until we're back where we started, see
1825 		 * ire_get_next_default_ire(). It doesn't matter if default
1826 		 * routes are added or deleted by other threads - we know this
1827 		 * ire will stay in the list because we hold a reference on the
1828 		 * ire bucket.
1829 		 * NB: if we only have interface default routes, ire is NULL so
1830 		 * we don't even enter this loop (see above).
1831 		 */
1832 		ire_origin = ire;
1833 		for (; ire != NULL;
1834 		    ire = ire_get_next_default_ire(ire, ire_origin)) {
1835 
1836 			if (ire_match_args_v6(ire, addr,
1837 			    &ipv6_all_zeros, gateway, type, ipif,
1838 			    zoneid, ihandle, tsl, flags)) {
1839 				int match_flags;
1840 
1841 				/*
1842 				 * We have something to work with.
1843 				 * If we can find a resolved/reachable
1844 				 * entry, we will use this. Otherwise
1845 				 * we'll try to find an entry that has
1846 				 * a resolved cache entry. We will fallback
1847 				 * on this if we don't find anything else.
1848 				 */
1849 				if (saved_ire == NULL)
1850 					saved_ire = ire;
1851 				mutex_enter(&ire->ire_lock);
1852 				gw_addr_v6 = ire->ire_gateway_addr_v6;
1853 				mutex_exit(&ire->ire_lock);
1854 				match_flags = MATCH_IRE_ILL_GROUP |
1855 				    MATCH_IRE_SECATTR;
1856 				rire = ire_ctable_lookup_v6(&gw_addr_v6, NULL,
1857 				    0, ire->ire_ipif, zoneid, tsl, match_flags);
1858 				if (rire != NULL) {
1859 					nce = rire->ire_nce;
1860 					if (nce != NULL &&
1861 					    NCE_ISREACHABLE(nce) &&
1862 					    nce->nce_flags & NCE_F_ISROUTER) {
1863 						ire_refrele(rire);
1864 						IRE_REFHOLD(ire);
1865 						IRB_REFRELE(irb_ptr);
1866 						goto found_ire_held;
1867 					} else if (nce != NULL &&
1868 					    !(nce->nce_flags &
1869 					    NCE_F_ISROUTER)) {
1870 						/*
1871 						 * Make sure we don't use
1872 						 * this ire
1873 						 */
1874 						if (saved_ire == ire)
1875 							saved_ire = NULL;
1876 					}
1877 					ire_refrele(rire);
1878 				} else if (ipv6_ire_default_count > 1 &&
1879 				    zoneid != ALL_ZONES) {
1880 					/*
1881 					 * When we're in a local zone, we're
1882 					 * only interested in default routers
1883 					 * that are reachable through ipifs
1884 					 * within our zone.
1885 					 * The potentially expensive call to
1886 					 * ire_route_lookup_v6() is avoided when
1887 					 * we have only one default route.
1888 					 */
1889 					rire = ire_route_lookup_v6(&gw_addr_v6,
1890 					    NULL, NULL, 0, ire->ire_ipif, NULL,
1891 					    zoneid, tsl, match_flags);
1892 					if (rire != NULL) {
1893 						ire_refrele(rire);
1894 						saved_ire = ire;
1895 					} else if (saved_ire == ire) {
1896 						/*
1897 						 * Make sure we don't use
1898 						 * this ire
1899 						 */
1900 						saved_ire = NULL;
1901 					}
1902 				}
1903 			}
1904 		}
1905 		if (saved_ire != NULL) {
1906 			ire = saved_ire;
1907 			IRE_REFHOLD(ire);
1908 			IRB_REFRELE(irb_ptr);
1909 			goto found_ire_held;
1910 		} else {
1911 			/*
1912 			 * Look for an interface default route matching the
1913 			 * args passed in. No round robin here. Just pick
1914 			 * the right one.
1915 			 */
1916 			for (ire = irb_ptr->irb_ire; ire != NULL;
1917 			    ire = ire->ire_next) {
1918 
1919 				if (!(ire->ire_type & IRE_INTERFACE))
1920 					continue;
1921 
1922 				if (ire->ire_marks & IRE_MARK_CONDEMNED)
1923 					continue;
1924 
1925 				if (ire_match_args_v6(ire, addr,
1926 				    &ipv6_all_zeros, gateway, type, ipif,
1927 				    zoneid, ihandle, tsl, flags)) {
1928 					IRE_REFHOLD(ire);
1929 					IRB_REFRELE(irb_ptr);
1930 					goto found_ire_held;
1931 				}
1932 			}
1933 			IRB_REFRELE(irb_ptr);
1934 		}
1935 	}
1936 	ASSERT(ire == NULL);
1937 	ip1dbg(("ire_ftable_lookup_v6: returning NULL ire"));
1938 	return (NULL);
1939 found_ire:
1940 	ASSERT((ire->ire_marks & IRE_MARK_CONDEMNED) == 0);
1941 	IRE_REFHOLD(ire);
1942 	rw_exit(&irb_ptr->irb_lock);
1943 
1944 found_ire_held:
1945 	if ((flags & MATCH_IRE_RJ_BHOLE) &&
1946 	    (ire->ire_flags & (RTF_BLACKHOLE | RTF_REJECT))) {
1947 		return (ire);
1948 	}
1949 	/*
1950 	 * At this point, IRE that was found must be an IRE_FORWARDTABLE
1951 	 * or IRE_CACHETABLE type.  If this is a recursive lookup and an
1952 	 * IRE_INTERFACE type was found, return that.  If it was some other
1953 	 * IRE_FORWARDTABLE type of IRE (one of the prefix types), then it
1954 	 * is necessary to fill in the  parent IRE pointed to by pire, and
1955 	 * then lookup the gateway address of  the parent.  For backwards
1956 	 * compatibility, if this lookup returns an
1957 	 * IRE other than a IRE_CACHETABLE or IRE_INTERFACE, then one more level
1958 	 * of lookup is done.
1959 	 */
1960 	if (flags & MATCH_IRE_RECURSIVE) {
1961 		const ipif_t *gw_ipif;
1962 		int match_flags = MATCH_IRE_DSTONLY;
1963 
1964 		if (ire->ire_type & IRE_INTERFACE)
1965 			return (ire);
1966 		if (pire != NULL)
1967 			*pire = ire;
1968 		/*
1969 		 * If we can't find an IRE_INTERFACE or the caller has not
1970 		 * asked for pire, we need to REFRELE the saved_ire.
1971 		 */
1972 		saved_ire = ire;
1973 
1974 		/*
1975 		 * Currently MATCH_IRE_ILL is never used with
1976 		 * (MATCH_IRE_RECURSIVE | MATCH_IRE_DEFAULT) while
1977 		 * sending out packets as MATCH_IRE_ILL is used only
1978 		 * for communicating with on-link hosts. We can't assert
1979 		 * that here as RTM_GET calls this function with
1980 		 * MATCH_IRE_ILL | MATCH_IRE_DEFAULT | MATCH_IRE_RECURSIVE.
1981 		 * We have already used the MATCH_IRE_ILL in determining
1982 		 * the right prefix route at this point. To match the
1983 		 * behavior of how we locate routes while sending out
1984 		 * packets, we don't want to use MATCH_IRE_ILL below
1985 		 * while locating the interface route.
1986 		 */
1987 		if (ire->ire_ipif != NULL)
1988 			match_flags |= MATCH_IRE_ILL_GROUP;
1989 
1990 		mutex_enter(&ire->ire_lock);
1991 		gw_addr_v6 = ire->ire_gateway_addr_v6;
1992 		mutex_exit(&ire->ire_lock);
1993 
1994 		ire = ire_route_lookup_v6(&gw_addr_v6, NULL, NULL, 0,
1995 		    ire->ire_ipif, NULL, zoneid, tsl, match_flags);
1996 		if (ire == NULL) {
1997 			/*
1998 			 * In this case we have to deal with the
1999 			 * MATCH_IRE_PARENT flag, which means the
2000 			 * parent has to be returned if ire is NULL.
2001 			 * The aim of this is to have (at least) a starting
2002 			 * ire when we want to look at all of the ires in a
2003 			 * bucket aimed at a single destination (as is the
2004 			 * case in ip_newroute_v6 for the RTF_MULTIRT
2005 			 * flagged routes).
2006 			 */
2007 			if (flags & MATCH_IRE_PARENT) {
2008 				if (pire != NULL) {
2009 					/*
2010 					 * Need an extra REFHOLD, if the
2011 					 * parent ire is returned via both
2012 					 * ire and pire.
2013 					 */
2014 					IRE_REFHOLD(saved_ire);
2015 				}
2016 				ire = saved_ire;
2017 			} else {
2018 				ire_refrele(saved_ire);
2019 				if (pire != NULL)
2020 					*pire = NULL;
2021 			}
2022 			return (ire);
2023 		}
2024 		if (ire->ire_type & (IRE_CACHETABLE | IRE_INTERFACE)) {
2025 			/*
2026 			 * If the caller did not ask for pire, release
2027 			 * it now.
2028 			 */
2029 			if (pire == NULL) {
2030 				ire_refrele(saved_ire);
2031 			}
2032 			return (ire);
2033 		}
2034 		match_flags |= MATCH_IRE_TYPE;
2035 		mutex_enter(&ire->ire_lock);
2036 		gw_addr_v6 = ire->ire_gateway_addr_v6;
2037 		mutex_exit(&ire->ire_lock);
2038 		gw_ipif = ire->ire_ipif;
2039 		ire_refrele(ire);
2040 		ire = ire_route_lookup_v6(&gw_addr_v6, NULL, NULL,
2041 		    (IRE_CACHETABLE | IRE_INTERFACE), gw_ipif, NULL, zoneid,
2042 		    NULL, match_flags);
2043 		if (ire == NULL) {
2044 			/*
2045 			 * In this case we have to deal with the
2046 			 * MATCH_IRE_PARENT flag, which means the
2047 			 * parent has to be returned if ire is NULL.
2048 			 * The aim of this is to have (at least) a starting
2049 			 * ire when we want to look at all of the ires in a
2050 			 * bucket aimed at a single destination (as is the
2051 			 * case in ip_newroute_v6 for the RTF_MULTIRT
2052 			 * flagged routes).
2053 			 */
2054 			if (flags & MATCH_IRE_PARENT) {
2055 				if (pire != NULL) {
2056 					/*
2057 					 * Need an extra REFHOLD, if the
2058 					 * parent ire is returned via both
2059 					 * ire and pire.
2060 					 */
2061 					IRE_REFHOLD(saved_ire);
2062 				}
2063 				ire = saved_ire;
2064 			} else {
2065 				ire_refrele(saved_ire);
2066 				if (pire != NULL)
2067 					*pire = NULL;
2068 			}
2069 			return (ire);
2070 		} else if (pire == NULL) {
2071 			/*
2072 			 * If the caller did not ask for pire, release
2073 			 * it now.
2074 			 */
2075 			ire_refrele(saved_ire);
2076 		}
2077 		return (ire);
2078 	}
2079 
2080 	ASSERT(pire == NULL || *pire == NULL);
2081 	return (ire);
2082 }
2083 
2084 /*
2085  * Delete the IRE cache for the gateway and all IRE caches whose
2086  * ire_gateway_addr_v6 points to this gateway, and allow them to
2087  * be created on demand by ip_newroute_v6.
2088  */
2089 void
2090 ire_clookup_delete_cache_gw_v6(const in6_addr_t *addr, zoneid_t zoneid)
2091 {
2092 	irb_t *irb;
2093 	ire_t *ire;
2094 
2095 	irb = &ip_cache_table_v6[IRE_ADDR_HASH_V6(*addr, ip6_cache_table_size)];
2096 	IRB_REFHOLD(irb);
2097 	for (ire = irb->irb_ire; ire != NULL; ire = ire->ire_next) {
2098 		if (ire->ire_marks & IRE_MARK_CONDEMNED)
2099 			continue;
2100 
2101 		ASSERT(IN6_ARE_ADDR_EQUAL(&ire->ire_mask_v6, &ipv6_all_ones));
2102 		if (ire_match_args_v6(ire, addr, &ire->ire_mask_v6, 0,
2103 		    IRE_CACHE, NULL, zoneid, 0, NULL, MATCH_IRE_TYPE)) {
2104 			ire_delete(ire);
2105 		}
2106 	}
2107 	IRB_REFRELE(irb);
2108 
2109 	ire_walk_v6(ire_delete_cache_gw_v6, (char *)addr, zoneid);
2110 }
2111 
2112 /*
2113  * Looks up cache table for a route.
2114  * specific lookup can be indicated by
2115  * passing the MATCH_* flags and the
2116  * necessary parameters.
2117  */
2118 ire_t *
2119 ire_ctable_lookup_v6(const in6_addr_t *addr, const in6_addr_t *gateway,
2120     int type, const ipif_t *ipif, zoneid_t zoneid, const ts_label_t *tsl,
2121     int flags)
2122 {
2123 	ire_t *ire;
2124 	irb_t *irb_ptr;
2125 	ASSERT(addr != NULL);
2126 	ASSERT((!(flags & MATCH_IRE_GW)) || gateway != NULL);
2127 
2128 	/*
2129 	 * ire_match_args_v6() will dereference ipif MATCH_IRE_SRC or
2130 	 * MATCH_IRE_ILL is set.
2131 	 */
2132 	if ((flags & (MATCH_IRE_SRC |  MATCH_IRE_ILL | MATCH_IRE_ILL_GROUP)) &&
2133 	    (ipif == NULL))
2134 		return (NULL);
2135 
2136 	irb_ptr = &ip_cache_table_v6[IRE_ADDR_HASH_V6(*addr,
2137 	    ip6_cache_table_size)];
2138 	rw_enter(&irb_ptr->irb_lock, RW_READER);
2139 	for (ire = irb_ptr->irb_ire; ire; ire = ire->ire_next) {
2140 		if (ire->ire_marks & IRE_MARK_CONDEMNED)
2141 			continue;
2142 
2143 		ASSERT(IN6_ARE_ADDR_EQUAL(&ire->ire_mask_v6, &ipv6_all_ones));
2144 		if (ire_match_args_v6(ire, addr, &ire->ire_mask_v6, gateway,
2145 		    type, ipif, zoneid, 0, tsl, flags)) {
2146 			IRE_REFHOLD(ire);
2147 			rw_exit(&irb_ptr->irb_lock);
2148 			return (ire);
2149 		}
2150 	}
2151 	rw_exit(&irb_ptr->irb_lock);
2152 	return (NULL);
2153 }
2154 
2155 /*
2156  * Lookup cache. Don't return IRE_MARK_HIDDEN entries. Callers
2157  * should use ire_ctable_lookup with MATCH_IRE_MARK_HIDDEN to get
2158  * to the hidden ones.
2159  */
2160 ire_t *
2161 ire_cache_lookup_v6(const in6_addr_t *addr, zoneid_t zoneid,
2162     const ts_label_t *tsl)
2163 {
2164 	irb_t *irb_ptr;
2165 	ire_t *ire;
2166 
2167 	irb_ptr = &ip_cache_table_v6[IRE_ADDR_HASH_V6(*addr,
2168 	    ip6_cache_table_size)];
2169 	rw_enter(&irb_ptr->irb_lock, RW_READER);
2170 	for (ire = irb_ptr->irb_ire; ire; ire = ire->ire_next) {
2171 		if (ire->ire_marks & (IRE_MARK_CONDEMNED|IRE_MARK_HIDDEN))
2172 			continue;
2173 		if (IN6_ARE_ADDR_EQUAL(&ire->ire_addr_v6, addr)) {
2174 			/*
2175 			 * Finally, check if the security policy has any
2176 			 * restriction on using this route for the specified
2177 			 * message.
2178 			 */
2179 			if (tsl != NULL &&
2180 			    ire->ire_gw_secattr != NULL &&
2181 			    tsol_ire_match_gwattr(ire, tsl) != 0) {
2182 				continue;
2183 			}
2184 
2185 			if (zoneid == ALL_ZONES || ire->ire_zoneid == zoneid ||
2186 			    ire->ire_zoneid == ALL_ZONES ||
2187 			    ire->ire_type == IRE_LOCAL) {
2188 				IRE_REFHOLD(ire);
2189 				rw_exit(&irb_ptr->irb_lock);
2190 				return (ire);
2191 			}
2192 		}
2193 	}
2194 	rw_exit(&irb_ptr->irb_lock);
2195 	return (NULL);
2196 }
2197 
2198 /*
2199  * Locate the interface ire that is tied to the cache ire 'cire' via
2200  * cire->ire_ihandle.
2201  *
2202  * We are trying to create the cache ire for an onlink destn. or
2203  * gateway in 'cire'. We are called from ire_add_v6() in the IRE_IF_RESOLVER
2204  * case for xresolv interfaces, after the ire has come back from
2205  * an external resolver.
2206  */
2207 static ire_t *
2208 ire_ihandle_lookup_onlink_v6(ire_t *cire)
2209 {
2210 	ire_t	*ire;
2211 	int	match_flags;
2212 	int	i;
2213 	int	j;
2214 	irb_t	*irb_ptr;
2215 
2216 	ASSERT(cire != NULL);
2217 
2218 	match_flags =  MATCH_IRE_TYPE | MATCH_IRE_IHANDLE | MATCH_IRE_MASK;
2219 	/*
2220 	 * We know that the mask of the interface ire equals cire->ire_cmask.
2221 	 * (When ip_newroute_v6() created 'cire' for an on-link destn.
2222 	 * it set its cmask from the interface ire's mask)
2223 	 */
2224 	ire = ire_ftable_lookup_v6(&cire->ire_addr_v6, &cire->ire_cmask_v6,
2225 	    NULL, IRE_INTERFACE, NULL, NULL, ALL_ZONES, cire->ire_ihandle,
2226 	    NULL, match_flags);
2227 	if (ire != NULL)
2228 		return (ire);
2229 	/*
2230 	 * If we didn't find an interface ire above, we can't declare failure.
2231 	 * For backwards compatibility, we need to support prefix routes
2232 	 * pointing to next hop gateways that are not on-link.
2233 	 *
2234 	 * In the resolver/noresolver case, ip_newroute_v6() thinks
2235 	 * it is creating the cache ire for an onlink destination in 'cire'.
2236 	 * But 'cire' is not actually onlink, because ire_ftable_lookup_v6()
2237 	 * cheated it, by doing ire_route_lookup_v6() twice and returning an
2238 	 * interface ire.
2239 	 *
2240 	 * Eg. default	-	gw1			(line 1)
2241 	 *	gw1	-	gw2			(line 2)
2242 	 *	gw2	-	hme0			(line 3)
2243 	 *
2244 	 * In the above example, ip_newroute_v6() tried to create the cache ire
2245 	 * 'cire' for gw1, based on the interface route in line 3. The
2246 	 * ire_ftable_lookup_v6() above fails, because there is
2247 	 * no interface route to reach gw1. (it is gw2). We fall thru below.
2248 	 *
2249 	 * Do a brute force search based on the ihandle in a subset of the
2250 	 * forwarding tables, corresponding to cire->ire_cmask_v6. Otherwise
2251 	 * things become very complex, since we don't have 'pire' in this
2252 	 * case. (Also note that this method is not possible in the offlink
2253 	 * case because we don't know the mask)
2254 	 */
2255 	i = ip_mask_to_plen_v6(&cire->ire_cmask_v6);
2256 	if ((ip_forwarding_table_v6[i]) == NULL)
2257 		return (NULL);
2258 	for (j = 0; j < ip6_ftable_hash_size; j++) {
2259 		irb_ptr = &ip_forwarding_table_v6[i][j];
2260 		rw_enter(&irb_ptr->irb_lock, RW_READER);
2261 		for (ire = irb_ptr->irb_ire; ire != NULL;
2262 		    ire = ire->ire_next) {
2263 			if (ire->ire_marks & IRE_MARK_CONDEMNED)
2264 				continue;
2265 			if ((ire->ire_type & IRE_INTERFACE) &&
2266 			    (ire->ire_ihandle == cire->ire_ihandle)) {
2267 				IRE_REFHOLD(ire);
2268 				rw_exit(&irb_ptr->irb_lock);
2269 				return (ire);
2270 			}
2271 		}
2272 		rw_exit(&irb_ptr->irb_lock);
2273 	}
2274 	return (NULL);
2275 }
2276 
2277 
2278 /*
2279  * Locate the interface ire that is tied to the cache ire 'cire' via
2280  * cire->ire_ihandle.
2281  *
2282  * We are trying to create the cache ire for an offlink destn based
2283  * on the cache ire of the gateway in 'cire'. 'pire' is the prefix ire
2284  * as found by ip_newroute_v6(). We are called from ip_newroute_v6() in
2285  * the IRE_CACHE case.
2286  */
2287 ire_t *
2288 ire_ihandle_lookup_offlink_v6(ire_t *cire, ire_t *pire)
2289 {
2290 	ire_t	*ire;
2291 	int	match_flags;
2292 	in6_addr_t	gw_addr;
2293 	ipif_t		*gw_ipif;
2294 
2295 	ASSERT(cire != NULL && pire != NULL);
2296 
2297 	match_flags =  MATCH_IRE_TYPE | MATCH_IRE_IHANDLE | MATCH_IRE_MASK;
2298 	/*
2299 	 * ip_newroute_v6 calls ire_ftable_lookup with MATCH_IRE_ILL only
2300 	 * for on-link hosts. We should never be here for onlink.
2301 	 * Thus, use MATCH_IRE_ILL_GROUP.
2302 	 */
2303 	if (pire->ire_ipif != NULL)
2304 		match_flags |= MATCH_IRE_ILL_GROUP;
2305 	/*
2306 	 * We know that the mask of the interface ire equals cire->ire_cmask.
2307 	 * (When ip_newroute_v6() created 'cire' for an on-link destn. it set
2308 	 * its cmask from the interface ire's mask)
2309 	 */
2310 	ire = ire_ftable_lookup_v6(&cire->ire_addr_v6, &cire->ire_cmask_v6, 0,
2311 	    IRE_INTERFACE, pire->ire_ipif, NULL, ALL_ZONES, cire->ire_ihandle,
2312 	    NULL, match_flags);
2313 	if (ire != NULL)
2314 		return (ire);
2315 	/*
2316 	 * If we didn't find an interface ire above, we can't declare failure.
2317 	 * For backwards compatibility, we need to support prefix routes
2318 	 * pointing to next hop gateways that are not on-link.
2319 	 *
2320 	 * Assume we are trying to ping some offlink destn, and we have the
2321 	 * routing table below.
2322 	 *
2323 	 * Eg.	default	- gw1		<--- pire	(line 1)
2324 	 *	gw1	- gw2				(line 2)
2325 	 *	gw2	- hme0				(line 3)
2326 	 *
2327 	 * If we already have a cache ire for gw1 in 'cire', the
2328 	 * ire_ftable_lookup_v6 above would have failed, since there is no
2329 	 * interface ire to reach gw1. We will fallthru below.
2330 	 *
2331 	 * Here we duplicate the steps that ire_ftable_lookup_v6() did in
2332 	 * getting 'cire' from 'pire', in the MATCH_IRE_RECURSIVE case.
2333 	 * The differences are the following
2334 	 * i.   We want the interface ire only, so we call
2335 	 *	ire_ftable_lookup_v6() instead of ire_route_lookup_v6()
2336 	 * ii.  We look for only prefix routes in the 1st call below.
2337 	 * ii.  We want to match on the ihandle in the 2nd call below.
2338 	 */
2339 	match_flags =  MATCH_IRE_TYPE;
2340 	if (pire->ire_ipif != NULL)
2341 		match_flags |= MATCH_IRE_ILL_GROUP;
2342 
2343 	mutex_enter(&pire->ire_lock);
2344 	gw_addr = pire->ire_gateway_addr_v6;
2345 	mutex_exit(&pire->ire_lock);
2346 	ire = ire_ftable_lookup_v6(&gw_addr, 0, 0, IRE_OFFSUBNET,
2347 	    pire->ire_ipif, NULL, ALL_ZONES, 0, NULL, match_flags);
2348 	if (ire == NULL)
2349 		return (NULL);
2350 	/*
2351 	 * At this point 'ire' corresponds to the entry shown in line 2.
2352 	 * gw_addr is 'gw2' in the example above.
2353 	 */
2354 	mutex_enter(&ire->ire_lock);
2355 	gw_addr = ire->ire_gateway_addr_v6;
2356 	mutex_exit(&ire->ire_lock);
2357 	gw_ipif = ire->ire_ipif;
2358 	ire_refrele(ire);
2359 
2360 	match_flags |= MATCH_IRE_IHANDLE;
2361 	ire = ire_ftable_lookup_v6(&gw_addr, 0, 0, IRE_INTERFACE,
2362 	    gw_ipif, NULL, ALL_ZONES, cire->ire_ihandle,
2363 	    NULL, match_flags);
2364 	return (ire);
2365 }
2366 
2367 /*
2368  * Return the IRE_LOOPBACK, IRE_IF_RESOLVER or IRE_IF_NORESOLVER
2369  * ire associated with the specified ipif.
2370  *
2371  * This might occasionally be called when IPIF_UP is not set since
2372  * the IPV6_MULTICAST_IF as well as creating interface routes
2373  * allows specifying a down ipif (ipif_lookup* match ipifs that are down).
2374  *
2375  * Note that if IPIF_NOLOCAL, IPIF_NOXMIT, or IPIF_DEPRECATED is set on
2376  * the ipif this routine might return NULL.
2377  * (Sometimes called as writer though not required by this function.)
2378  */
2379 ire_t *
2380 ipif_to_ire_v6(const ipif_t *ipif)
2381 {
2382 	ire_t	*ire;
2383 
2384 	ASSERT(ipif->ipif_isv6);
2385 	if (ipif->ipif_ire_type == IRE_LOOPBACK) {
2386 		ire = ire_ctable_lookup_v6(&ipif->ipif_v6lcl_addr, NULL,
2387 		    IRE_LOOPBACK, ipif, ALL_ZONES, NULL,
2388 		    (MATCH_IRE_TYPE | MATCH_IRE_IPIF));
2389 	} else if (ipif->ipif_flags & IPIF_POINTOPOINT) {
2390 		/* In this case we need to lookup destination address. */
2391 		ire = ire_ftable_lookup_v6(&ipif->ipif_v6pp_dst_addr,
2392 		    &ipv6_all_ones, NULL, IRE_INTERFACE, ipif, NULL, ALL_ZONES,
2393 		    0, NULL, (MATCH_IRE_TYPE | MATCH_IRE_IPIF |
2394 		    MATCH_IRE_MASK));
2395 	} else {
2396 		ire = ire_ftable_lookup_v6(&ipif->ipif_v6subnet,
2397 		    &ipif->ipif_v6net_mask, NULL, IRE_INTERFACE, ipif, NULL,
2398 		    ALL_ZONES, 0, NULL, (MATCH_IRE_TYPE | MATCH_IRE_IPIF |
2399 		    MATCH_IRE_MASK));
2400 	}
2401 	return (ire);
2402 }
2403 
2404 /*
2405  * Return B_TRUE if a multirt route is resolvable
2406  * (or if no route is resolved yet), B_FALSE otherwise.
2407  * This only works in the global zone.
2408  */
2409 boolean_t
2410 ire_multirt_need_resolve_v6(const in6_addr_t *v6dstp, const ts_label_t *tsl)
2411 {
2412 	ire_t	*first_fire;
2413 	ire_t	*first_cire;
2414 	ire_t	*fire;
2415 	ire_t	*cire;
2416 	irb_t	*firb;
2417 	irb_t	*cirb;
2418 	int	unres_cnt = 0;
2419 	boolean_t resolvable = B_FALSE;
2420 
2421 	/* Retrieve the first IRE_HOST that matches the destination */
2422 	first_fire = ire_ftable_lookup_v6(v6dstp, &ipv6_all_ones, 0, IRE_HOST,
2423 	    NULL, NULL, ALL_ZONES, 0, tsl, MATCH_IRE_MASK | MATCH_IRE_TYPE |
2424 	    MATCH_IRE_SECATTR);
2425 
2426 	/* No route at all */
2427 	if (first_fire == NULL) {
2428 		return (B_TRUE);
2429 	}
2430 
2431 	firb = first_fire->ire_bucket;
2432 	ASSERT(firb);
2433 
2434 	/* Retrieve the first IRE_CACHE ire for that destination. */
2435 	first_cire = ire_cache_lookup_v6(v6dstp, GLOBAL_ZONEID, tsl);
2436 
2437 	/* No resolved route. */
2438 	if (first_cire == NULL) {
2439 		ire_refrele(first_fire);
2440 		return (B_TRUE);
2441 	}
2442 
2443 	/* At least one route is resolved. */
2444 
2445 	cirb = first_cire->ire_bucket;
2446 	ASSERT(cirb);
2447 
2448 	/* Count the number of routes to that dest that are declared. */
2449 	IRB_REFHOLD(firb);
2450 	for (fire = first_fire; fire != NULL; fire = fire->ire_next) {
2451 		if (!(fire->ire_flags & RTF_MULTIRT))
2452 			continue;
2453 		if (!IN6_ARE_ADDR_EQUAL(&fire->ire_addr_v6, v6dstp))
2454 			continue;
2455 		unres_cnt++;
2456 	}
2457 	IRB_REFRELE(firb);
2458 
2459 
2460 	/* Then subtract the number of routes to that dst that are resolved */
2461 	IRB_REFHOLD(cirb);
2462 	for (cire = first_cire; cire != NULL; cire = cire->ire_next) {
2463 	    if (!(cire->ire_flags & RTF_MULTIRT))
2464 		continue;
2465 	    if (!IN6_ARE_ADDR_EQUAL(&cire->ire_addr_v6, v6dstp))
2466 		continue;
2467 	    if (cire->ire_marks & (IRE_MARK_CONDEMNED|IRE_MARK_HIDDEN))
2468 		continue;
2469 	    unres_cnt--;
2470 	}
2471 	IRB_REFRELE(cirb);
2472 
2473 	/* At least one route is unresolved; search for a resolvable route. */
2474 	if (unres_cnt > 0)
2475 		resolvable = ire_multirt_lookup_v6(&first_cire, &first_fire,
2476 		    MULTIRT_USESTAMP|MULTIRT_CACHEGW, tsl);
2477 
2478 	if (first_fire)
2479 		ire_refrele(first_fire);
2480 
2481 	if (first_cire)
2482 		ire_refrele(first_cire);
2483 
2484 	return (resolvable);
2485 }
2486 
2487 
2488 /*
2489  * Return B_TRUE and update *ire_arg and *fire_arg
2490  * if at least one resolvable route is found.
2491  * Return B_FALSE otherwise (all routes are resolved or
2492  * the remaining unresolved routes are all unresolvable).
2493  * This only works in the global zone.
2494  */
2495 boolean_t
2496 ire_multirt_lookup_v6(ire_t **ire_arg, ire_t **fire_arg, uint32_t flags,
2497     const ts_label_t *tsl)
2498 {
2499 	clock_t	delta;
2500 	ire_t	*best_fire = NULL;
2501 	ire_t	*best_cire = NULL;
2502 	ire_t	*first_fire;
2503 	ire_t	*first_cire;
2504 	ire_t	*fire;
2505 	ire_t	*cire;
2506 	irb_t	*firb = NULL;
2507 	irb_t	*cirb = NULL;
2508 	ire_t	*gw_ire;
2509 	boolean_t	already_resolved;
2510 	boolean_t	res;
2511 	in6_addr_t	v6dst;
2512 	in6_addr_t	v6gw;
2513 
2514 	ip2dbg(("ire_multirt_lookup_v6: *ire_arg %p, *fire_arg %p, "
2515 	    "flags %04x\n", (void *)*ire_arg, (void *)*fire_arg, flags));
2516 
2517 	ASSERT(ire_arg);
2518 	ASSERT(fire_arg);
2519 
2520 	/* Not an IRE_HOST ire; give up. */
2521 	if ((*fire_arg == NULL) ||
2522 	    ((*fire_arg)->ire_type != IRE_HOST)) {
2523 		return (B_FALSE);
2524 	}
2525 
2526 	/* This is the first IRE_HOST ire for that destination. */
2527 	first_fire = *fire_arg;
2528 	firb = first_fire->ire_bucket;
2529 	ASSERT(firb);
2530 
2531 	mutex_enter(&first_fire->ire_lock);
2532 	v6dst = first_fire->ire_addr_v6;
2533 	mutex_exit(&first_fire->ire_lock);
2534 
2535 	ip2dbg(("ire_multirt_lookup_v6: dst %08x\n",
2536 	    ntohl(V4_PART_OF_V6(v6dst))));
2537 
2538 	/*
2539 	 * Retrieve the first IRE_CACHE ire for that destination;
2540 	 * if we don't find one, no route for that dest is
2541 	 * resolved yet.
2542 	 */
2543 	first_cire = ire_cache_lookup_v6(&v6dst, GLOBAL_ZONEID, tsl);
2544 	if (first_cire) {
2545 		cirb = first_cire->ire_bucket;
2546 	}
2547 
2548 	ip2dbg(("ire_multirt_lookup_v6: first_cire %p\n", (void *)first_cire));
2549 
2550 	/*
2551 	 * Search for a resolvable route, giving the top priority
2552 	 * to routes that can be resolved without any call to the resolver.
2553 	 */
2554 	IRB_REFHOLD(firb);
2555 
2556 	if (!IN6_IS_ADDR_MULTICAST(&v6dst)) {
2557 		/*
2558 		 * For all multiroute IRE_HOST ires for that destination,
2559 		 * check if the route via the IRE_HOST's gateway is
2560 		 * resolved yet.
2561 		 */
2562 		for (fire = first_fire; fire != NULL; fire = fire->ire_next) {
2563 
2564 			if (!(fire->ire_flags & RTF_MULTIRT))
2565 				continue;
2566 			if (!IN6_ARE_ADDR_EQUAL(&fire->ire_addr_v6, &v6dst))
2567 				continue;
2568 
2569 			if (fire->ire_gw_secattr != NULL &&
2570 			    tsol_ire_match_gwattr(fire, tsl) != 0) {
2571 				continue;
2572 			}
2573 
2574 			mutex_enter(&fire->ire_lock);
2575 			v6gw = fire->ire_gateway_addr_v6;
2576 			mutex_exit(&fire->ire_lock);
2577 
2578 			ip2dbg(("ire_multirt_lookup_v6: fire %p, "
2579 			    "ire_addr %08x, ire_gateway_addr %08x\n",
2580 			    (void *)fire,
2581 			    ntohl(V4_PART_OF_V6(fire->ire_addr_v6)),
2582 			    ntohl(V4_PART_OF_V6(v6gw))));
2583 
2584 			already_resolved = B_FALSE;
2585 
2586 			if (first_cire) {
2587 				ASSERT(cirb);
2588 
2589 				IRB_REFHOLD(cirb);
2590 				/*
2591 				 * For all IRE_CACHE ires for that
2592 				 * destination.
2593 				 */
2594 				for (cire = first_cire;
2595 				    cire != NULL;
2596 				    cire = cire->ire_next) {
2597 
2598 					if (!(cire->ire_flags & RTF_MULTIRT))
2599 						continue;
2600 					if (!IN6_ARE_ADDR_EQUAL(
2601 					    &cire->ire_addr_v6, &v6dst))
2602 						continue;
2603 					if (cire->ire_marks &
2604 					    (IRE_MARK_CONDEMNED|
2605 						IRE_MARK_HIDDEN))
2606 						continue;
2607 
2608 					if (cire->ire_gw_secattr != NULL &&
2609 					    tsol_ire_match_gwattr(cire,
2610 					    tsl) != 0) {
2611 						continue;
2612 					}
2613 
2614 					/*
2615 					 * Check if the IRE_CACHE's gateway
2616 					 * matches the IRE_HOST's gateway.
2617 					 */
2618 					if (IN6_ARE_ADDR_EQUAL(
2619 					    &cire->ire_gateway_addr_v6,
2620 					    &v6gw)) {
2621 						already_resolved = B_TRUE;
2622 						break;
2623 					}
2624 				}
2625 				IRB_REFRELE(cirb);
2626 			}
2627 
2628 			/*
2629 			 * This route is already resolved;
2630 			 * proceed with next one.
2631 			 */
2632 			if (already_resolved) {
2633 				ip2dbg(("ire_multirt_lookup_v6: found cire %p, "
2634 				    "already resolved\n", (void *)cire));
2635 				continue;
2636 			}
2637 
2638 			/*
2639 			 * The route is unresolved; is it actually
2640 			 * resolvable, i.e. is there a cache or a resolver
2641 			 * for the gateway?
2642 			 */
2643 			gw_ire = ire_route_lookup_v6(&v6gw, 0, 0, 0, NULL, NULL,
2644 			    ALL_ZONES, tsl, MATCH_IRE_RECURSIVE |
2645 			    MATCH_IRE_SECATTR);
2646 
2647 			ip2dbg(("ire_multirt_lookup_v6: looked up gw_ire %p\n",
2648 			    (void *)gw_ire));
2649 
2650 			/*
2651 			 * This route can be resolved without any call to the
2652 			 * resolver; if the MULTIRT_CACHEGW flag is set,
2653 			 * give the top priority to this ire and exit the
2654 			 * loop.
2655 			 * This occurs when a resolver reply is processed
2656 			 * through ip_wput_nondata()
2657 			 */
2658 			if ((flags & MULTIRT_CACHEGW) &&
2659 			    (gw_ire != NULL) &&
2660 			    (gw_ire->ire_type & IRE_CACHETABLE)) {
2661 				/*
2662 				 * Release the resolver associated to the
2663 				 * previous candidate best ire, if any.
2664 				 */
2665 				if (best_cire) {
2666 					ire_refrele(best_cire);
2667 					ASSERT(best_fire);
2668 				}
2669 
2670 				best_fire = fire;
2671 				best_cire = gw_ire;
2672 
2673 				ip2dbg(("ire_multirt_lookup_v6: found top prio "
2674 				    "best_fire %p, best_cire %p\n",
2675 				    (void *)best_fire, (void *)best_cire));
2676 				break;
2677 			}
2678 
2679 			/*
2680 			 * Compute the time elapsed since our preceding
2681 			 * attempt to  resolve that route.
2682 			 * If the MULTIRT_USESTAMP flag is set, we take that
2683 			 * route into account only if this time interval
2684 			 * exceeds ip_multirt_resolution_interval;
2685 			 * this prevents us from attempting to resolve a
2686 			 * broken route upon each sending of a packet.
2687 			 */
2688 			delta = lbolt - fire->ire_last_used_time;
2689 			delta = TICK_TO_MSEC(delta);
2690 
2691 			res = (boolean_t)
2692 			    ((delta > ip_multirt_resolution_interval) ||
2693 				(!(flags & MULTIRT_USESTAMP)));
2694 
2695 			ip2dbg(("ire_multirt_lookup_v6: fire %p, delta %lu, "
2696 			    "res %d\n",
2697 			    (void *)fire, delta, res));
2698 
2699 			if (res) {
2700 				/*
2701 				 * A resolver exists for the gateway: save
2702 				 * the current IRE_HOST ire as a candidate
2703 				 * best ire. If we later discover that a
2704 				 * top priority ire exists (i.e. no need to
2705 				 * call the resolver), then this new ire
2706 				 * will be preferred to the current one.
2707 				 */
2708 				if (gw_ire != NULL) {
2709 					if (best_fire == NULL) {
2710 						ASSERT(best_cire == NULL);
2711 
2712 						best_fire = fire;
2713 						best_cire = gw_ire;
2714 
2715 						ip2dbg(("ire_multirt_lookup_v6:"
2716 						    "found candidate "
2717 						    "best_fire %p, "
2718 						    "best_cire %p\n",
2719 						    (void *)best_fire,
2720 						    (void *)best_cire));
2721 
2722 						/*
2723 						 * If MULTIRT_CACHEGW is not
2724 						 * set, we ignore the top
2725 						 * priority ires that can
2726 						 * be resolved without any
2727 						 * call to the resolver;
2728 						 * In that case, there is
2729 						 * actually no need
2730 						 * to continue the loop.
2731 						 */
2732 						if (!(flags &
2733 						    MULTIRT_CACHEGW)) {
2734 							break;
2735 						}
2736 						continue;
2737 					}
2738 				} else {
2739 					/*
2740 					 * No resolver for the gateway: the
2741 					 * route is not resolvable.
2742 					 * If the MULTIRT_SETSTAMP flag is
2743 					 * set, we stamp the IRE_HOST ire,
2744 					 * so we will not select it again
2745 					 * during this resolution interval.
2746 					 */
2747 					if (flags & MULTIRT_SETSTAMP)
2748 						fire->ire_last_used_time =
2749 						    lbolt;
2750 				}
2751 			}
2752 
2753 			if (gw_ire != NULL)
2754 				ire_refrele(gw_ire);
2755 		}
2756 	} else { /* IN6_IS_ADDR_MULTICAST(&v6dst) */
2757 
2758 		for (fire = first_fire;
2759 		    fire != NULL;
2760 		    fire = fire->ire_next) {
2761 
2762 			if (!(fire->ire_flags & RTF_MULTIRT))
2763 				continue;
2764 			if (!IN6_ARE_ADDR_EQUAL(&fire->ire_addr_v6, &v6dst))
2765 				continue;
2766 
2767 			if (fire->ire_gw_secattr != NULL &&
2768 			    tsol_ire_match_gwattr(fire, tsl) != 0) {
2769 				continue;
2770 			}
2771 
2772 			already_resolved = B_FALSE;
2773 
2774 			mutex_enter(&fire->ire_lock);
2775 			v6gw = fire->ire_gateway_addr_v6;
2776 			mutex_exit(&fire->ire_lock);
2777 
2778 			gw_ire = ire_ftable_lookup_v6(&v6gw, 0, 0,
2779 			    IRE_INTERFACE, NULL, NULL, ALL_ZONES, 0, tsl,
2780 			    MATCH_IRE_RECURSIVE | MATCH_IRE_TYPE |
2781 			    MATCH_IRE_SECATTR);
2782 
2783 			/* No resolver for the gateway; we skip this ire. */
2784 			if (gw_ire == NULL) {
2785 				continue;
2786 			}
2787 
2788 			if (first_cire) {
2789 
2790 				IRB_REFHOLD(cirb);
2791 				/*
2792 				 * For all IRE_CACHE ires for that
2793 				 * destination.
2794 				 */
2795 				for (cire = first_cire;
2796 				    cire != NULL;
2797 				    cire = cire->ire_next) {
2798 
2799 					if (!(cire->ire_flags & RTF_MULTIRT))
2800 						continue;
2801 					if (!IN6_ARE_ADDR_EQUAL(
2802 					    &cire->ire_addr_v6, &v6dst))
2803 						continue;
2804 					if (cire->ire_marks &
2805 					    (IRE_MARK_CONDEMNED|
2806 						IRE_MARK_HIDDEN))
2807 						continue;
2808 
2809 					if (cire->ire_gw_secattr != NULL &&
2810 					    tsol_ire_match_gwattr(cire,
2811 					    tsl) != 0) {
2812 						continue;
2813 					}
2814 
2815 					/*
2816 					 * Cache entries are linked to the
2817 					 * parent routes using the parent handle
2818 					 * (ire_phandle). If no cache entry has
2819 					 * the same handle as fire, fire is
2820 					 * still unresolved.
2821 					 */
2822 					ASSERT(cire->ire_phandle != 0);
2823 					if (cire->ire_phandle ==
2824 					    fire->ire_phandle) {
2825 						already_resolved = B_TRUE;
2826 						break;
2827 					}
2828 				}
2829 				IRB_REFRELE(cirb);
2830 			}
2831 
2832 			/*
2833 			 * This route is already resolved; proceed with
2834 			 * next one.
2835 			 */
2836 			if (already_resolved) {
2837 				ire_refrele(gw_ire);
2838 				continue;
2839 			}
2840 
2841 			/*
2842 			 * Compute the time elapsed since our preceding
2843 			 * attempt to resolve that route.
2844 			 * If the MULTIRT_USESTAMP flag is set, we take
2845 			 * that route into account only if this time
2846 			 * interval exceeds ip_multirt_resolution_interval;
2847 			 * this prevents us from attempting to resolve a
2848 			 * broken route upon each sending of a packet.
2849 			 */
2850 			delta = lbolt - fire->ire_last_used_time;
2851 			delta = TICK_TO_MSEC(delta);
2852 
2853 			res = (boolean_t)
2854 			    ((delta > ip_multirt_resolution_interval) ||
2855 			    (!(flags & MULTIRT_USESTAMP)));
2856 
2857 			ip3dbg(("ire_multirt_lookup_v6: fire %p, delta %lx, "
2858 			    "flags %04x, res %d\n",
2859 			    (void *)fire, delta, flags, res));
2860 
2861 			if (res) {
2862 				if (best_cire) {
2863 					/*
2864 					 * Release the resolver associated
2865 					 * to the preceding candidate best
2866 					 * ire, if any.
2867 					 */
2868 					ire_refrele(best_cire);
2869 					ASSERT(best_fire);
2870 				}
2871 				best_fire = fire;
2872 				best_cire = gw_ire;
2873 				continue;
2874 			}
2875 
2876 			ire_refrele(gw_ire);
2877 		}
2878 	}
2879 
2880 	if (best_fire) {
2881 		IRE_REFHOLD(best_fire);
2882 	}
2883 	IRB_REFRELE(firb);
2884 
2885 	/* Release the first IRE_CACHE we initially looked up, if any. */
2886 	if (first_cire)
2887 		ire_refrele(first_cire);
2888 
2889 	/* Found a resolvable route. */
2890 	if (best_fire) {
2891 		ASSERT(best_cire);
2892 
2893 		if (*fire_arg)
2894 			ire_refrele(*fire_arg);
2895 		if (*ire_arg)
2896 			ire_refrele(*ire_arg);
2897 
2898 		/*
2899 		 * Update the passed arguments with the
2900 		 * resolvable multirt route we found
2901 		 */
2902 		*fire_arg = best_fire;
2903 		*ire_arg = best_cire;
2904 
2905 		ip2dbg(("ire_multirt_lookup_v6: returning B_TRUE, "
2906 		    "*fire_arg %p, *ire_arg %p\n",
2907 		    (void *)best_fire, (void *)best_cire));
2908 
2909 		return (B_TRUE);
2910 	}
2911 
2912 	ASSERT(best_cire == NULL);
2913 
2914 	ip2dbg(("ire_multirt_lookup_v6: returning B_FALSE, *fire_arg %p, "
2915 	    "*ire_arg %p\n",
2916 	    (void *)*fire_arg, (void *)*ire_arg));
2917 
2918 	/* No resolvable route. */
2919 	return (B_FALSE);
2920 }
2921 
2922 
2923 /*
2924  * Find an IRE_OFFSUBNET IRE entry for the multicast address 'v6dstp'
2925  * that goes through 'ipif'. As a fallback, a route that goes through
2926  * ipif->ipif_ill can be returned.
2927  */
2928 ire_t *
2929 ipif_lookup_multi_ire_v6(ipif_t *ipif, const in6_addr_t *v6dstp)
2930 {
2931 	ire_t	*ire;
2932 	ire_t	*save_ire = NULL;
2933 	ire_t   *gw_ire;
2934 	irb_t   *irb;
2935 	in6_addr_t v6gw;
2936 	int	match_flags = MATCH_IRE_TYPE | MATCH_IRE_ILL;
2937 
2938 	ire = ire_ftable_lookup_v6(v6dstp, 0, 0, 0, NULL, NULL, ALL_ZONES, 0,
2939 	    NULL, MATCH_IRE_DEFAULT);
2940 
2941 	if (ire == NULL)
2942 		return (NULL);
2943 
2944 	irb = ire->ire_bucket;
2945 	ASSERT(irb);
2946 
2947 	IRB_REFHOLD(irb);
2948 	ire_refrele(ire);
2949 	for (ire = irb->irb_ire; ire != NULL; ire = ire->ire_next) {
2950 		if (!IN6_ARE_ADDR_EQUAL(&ire->ire_addr_v6, v6dstp) ||
2951 		    (ipif->ipif_zoneid != ire->ire_zoneid &&
2952 		    ire->ire_zoneid != ALL_ZONES)) {
2953 			continue;
2954 		}
2955 
2956 		switch (ire->ire_type) {
2957 		case IRE_DEFAULT:
2958 		case IRE_PREFIX:
2959 		case IRE_HOST:
2960 			mutex_enter(&ire->ire_lock);
2961 			v6gw = ire->ire_gateway_addr_v6;
2962 			mutex_exit(&ire->ire_lock);
2963 			gw_ire = ire_ftable_lookup_v6(&v6gw, 0, 0,
2964 			    IRE_INTERFACE, ipif, NULL, ALL_ZONES, 0,
2965 			    NULL, match_flags);
2966 
2967 			if (gw_ire != NULL) {
2968 				if (save_ire != NULL) {
2969 					ire_refrele(save_ire);
2970 				}
2971 				IRE_REFHOLD(ire);
2972 				if (gw_ire->ire_ipif == ipif) {
2973 					ire_refrele(gw_ire);
2974 
2975 					IRB_REFRELE(irb);
2976 					return (ire);
2977 				}
2978 				ire_refrele(gw_ire);
2979 				save_ire = ire;
2980 			}
2981 			break;
2982 		case IRE_IF_NORESOLVER:
2983 		case IRE_IF_RESOLVER:
2984 			if (ire->ire_ipif == ipif) {
2985 				if (save_ire != NULL) {
2986 					ire_refrele(save_ire);
2987 				}
2988 				IRE_REFHOLD(ire);
2989 
2990 				IRB_REFRELE(irb);
2991 				return (ire);
2992 			}
2993 			break;
2994 		}
2995 	}
2996 	IRB_REFRELE(irb);
2997 
2998 	return (save_ire);
2999 }
3000