xref: /titanic_50/usr/src/uts/common/inet/ip/ip6_ire.c (revision a42ff480eab7fd4f2b53fe8e9bdb1b57f0cf64da)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 /*
26  * Copyright (c) 1990 Mentat Inc.
27  */
28 
29 #pragma ident	"%Z%%M%	%I%	%E% SMI"
30 
31 /*
32  * This file contains routines that manipulate Internet Routing Entries (IREs).
33  */
34 #include <sys/types.h>
35 #include <sys/stream.h>
36 #include <sys/stropts.h>
37 #include <sys/ddi.h>
38 #include <sys/cmn_err.h>
39 
40 #include <sys/systm.h>
41 #include <sys/param.h>
42 #include <sys/socket.h>
43 #include <net/if.h>
44 #include <net/route.h>
45 #include <netinet/in.h>
46 #include <net/if_dl.h>
47 #include <netinet/ip6.h>
48 #include <netinet/icmp6.h>
49 
50 #include <inet/common.h>
51 #include <inet/mi.h>
52 #include <inet/ip.h>
53 #include <inet/ip6.h>
54 #include <inet/ip_ndp.h>
55 #include <inet/ip_if.h>
56 #include <inet/ip_ire.h>
57 #include <inet/ipclassifier.h>
58 #include <inet/nd.h>
59 #include <sys/kmem.h>
60 #include <sys/zone.h>
61 
62 #include <sys/tsol/label.h>
63 #include <sys/tsol/tnet.h>
64 
65 static	ire_t	ire_null;
66 
67 static ire_t	*ire_ihandle_lookup_onlink_v6(ire_t *cire);
68 static	void	ire_report_ftable_v6(ire_t *ire, char *mp);
69 static	void	ire_report_ctable_v6(ire_t *ire, char *mp);
70 static boolean_t ire_match_args_v6(ire_t *ire, const in6_addr_t *addr,
71     const in6_addr_t *mask, const in6_addr_t *gateway, int type,
72     const ipif_t *ipif, zoneid_t zoneid, uint32_t ihandle,
73     const ts_label_t *tsl, int match_flags);
74 static	ire_t	*ire_init_v6(ire_t *, const in6_addr_t *, const in6_addr_t *,
75     const in6_addr_t *, const in6_addr_t *, uint_t *, queue_t *, queue_t *,
76     ushort_t, ipif_t *, const in6_addr_t *, uint32_t, uint32_t, uint_t,
77     const iulp_t *, tsol_gc_t *, tsol_gcgrp_t *, ip_stack_t *);
78 
79 /*
80  * Named Dispatch routine to produce a formatted report on all IREs.
81  * This report is accessed by using the ndd utility to "get" ND variable
82  * "ip_ire_status_v6".
83  */
84 /* ARGSUSED */
85 int
86 ip_ire_report_v6(queue_t *q, mblk_t *mp, caddr_t arg, cred_t *ioc_cr)
87 {
88 	zoneid_t zoneid;
89 	ip_stack_t *ipst;
90 
91 	(void) mi_mpprintf(mp,
92 	    "IRE      " MI_COL_HDRPAD_STR
93 	    "rfq      " MI_COL_HDRPAD_STR
94 	    "stq      " MI_COL_HDRPAD_STR
95 	    " zone mxfrg rtt   rtt_sd ssthresh ref "
96 	    "rtomax tstamp_ok wscale_ok ecn_ok pmtud_ok sack sendpipe recvpipe "
97 	    "in/out/forward type    addr         mask         "
98 	    "src             gateway");
99 	/*
100 	 *   01234567 01234567 01234567 12345 12345 12345 12345  12345678 123
101 	 *   123456 123456789 123456789 123456 12345678 1234 12345678 12345678
102 	 *   in/out/forward xxxxxxxxxx
103 	 *   xxxx:xxxx:xxxx:xxxx:xxxx:xxxx:xxxx:xxxx
104 	 *   xxxx:xxxx:xxxx:xxxx:xxxx:xxxx:xxxx:xxxx
105 	 *   xxxx:xxxx:xxxx:xxxx:xxxx:xxxx:xxxx:xxxx
106 	 *   xxxx:xxxx:xxxx:xxxx:xxxx:xxxx:xxxx:xxxx
107 	 */
108 
109 	/*
110 	 * Because of the ndd constraint, at most we can have 64K buffer
111 	 * to put in all IRE info.  So to be more efficient, just
112 	 * allocate a 64K buffer here, assuming we need that large buffer.
113 	 * This should be OK as only root can do ndd /dev/ip.
114 	 */
115 	if ((mp->b_cont = allocb(ND_MAX_BUF_LEN, BPRI_HI)) == NULL) {
116 		/* The following may work even if we cannot get a large buf. */
117 		(void) mi_mpprintf(mp, "<< Out of buffer >>\n");
118 		return (0);
119 	}
120 	zoneid = Q_TO_CONN(q)->conn_zoneid;
121 	if (zoneid == GLOBAL_ZONEID)
122 		zoneid = ALL_ZONES;
123 	ipst = CONNQ_TO_IPST(q);
124 
125 	ire_walk_v6(ire_report_ftable_v6, (char *)mp->b_cont, zoneid, ipst);
126 	ire_walk_v6(ire_report_ctable_v6, (char *)mp->b_cont, zoneid, ipst);
127 	return (0);
128 }
129 
130 /*
131  * ire_walk routine invoked for ip_ire_report_v6 for each IRE.
132  */
133 static void
134 ire_report_ftable_v6(ire_t *ire, char *mp)
135 {
136 	char	buf1[INET6_ADDRSTRLEN];
137 	char	buf2[INET6_ADDRSTRLEN];
138 	char	buf3[INET6_ADDRSTRLEN];
139 	char	buf4[INET6_ADDRSTRLEN];
140 	uint_t	fo_pkt_count;
141 	uint_t	ib_pkt_count;
142 	int	ref;
143 	in6_addr_t gw_addr_v6;
144 	uint_t	print_len, buf_len;
145 
146 	ASSERT(ire->ire_ipversion == IPV6_VERSION);
147 	if (ire->ire_type & IRE_CACHETABLE)
148 		return;
149 	buf_len = ((mblk_t *)mp)->b_datap->db_lim - ((mblk_t *)mp)->b_wptr;
150 	if (buf_len <= 0)
151 		return;
152 
153 	/* Number of active references of this ire */
154 	ref = ire->ire_refcnt;
155 	/* "inbound" to a non local address is a forward */
156 	ib_pkt_count = ire->ire_ib_pkt_count;
157 	fo_pkt_count = 0;
158 	ASSERT(!(ire->ire_type & IRE_BROADCAST));
159 	if (!(ire->ire_type & (IRE_LOCAL|IRE_BROADCAST))) {
160 		fo_pkt_count = ib_pkt_count;
161 		ib_pkt_count = 0;
162 	}
163 
164 	mutex_enter(&ire->ire_lock);
165 	gw_addr_v6 = ire->ire_gateway_addr_v6;
166 	mutex_exit(&ire->ire_lock);
167 
168 	print_len = snprintf((char *)((mblk_t *)mp)->b_wptr, buf_len,
169 	    MI_COL_PTRFMT_STR MI_COL_PTRFMT_STR MI_COL_PTRFMT_STR "%5d "
170 	    "%05d %05ld %06ld %08d %03d %06d %09d %09d %06d %08d "
171 	    "%04d %08d %08d %d/%d/%d %s\n\t%s\n\t%s\n\t%s\n\t%s\n",
172 	    (void *)ire, (void *)ire->ire_rfq, (void *)ire->ire_stq,
173 	    (int)ire->ire_zoneid,
174 	    ire->ire_max_frag, ire->ire_uinfo.iulp_rtt,
175 	    ire->ire_uinfo.iulp_rtt_sd,
176 	    ire->ire_uinfo.iulp_ssthresh, ref,
177 	    ire->ire_uinfo.iulp_rtomax,
178 	    (ire->ire_uinfo.iulp_tstamp_ok ? 1: 0),
179 	    (ire->ire_uinfo.iulp_wscale_ok ? 1: 0),
180 	    (ire->ire_uinfo.iulp_ecn_ok ? 1: 0),
181 	    (ire->ire_uinfo.iulp_pmtud_ok ? 1: 0),
182 	    ire->ire_uinfo.iulp_sack,
183 	    ire->ire_uinfo.iulp_spipe, ire->ire_uinfo.iulp_rpipe,
184 	    ib_pkt_count, ire->ire_ob_pkt_count, fo_pkt_count,
185 	    ip_nv_lookup(ire_nv_tbl, (int)ire->ire_type),
186 	    inet_ntop(AF_INET6, &ire->ire_addr_v6, buf1, sizeof (buf1)),
187 	    inet_ntop(AF_INET6, &ire->ire_mask_v6, buf2, sizeof (buf2)),
188 	    inet_ntop(AF_INET6, &ire->ire_src_addr_v6, buf3, sizeof (buf3)),
189 	    inet_ntop(AF_INET6, &gw_addr_v6, buf4, sizeof (buf4)));
190 	if (print_len < buf_len) {
191 		((mblk_t *)mp)->b_wptr += print_len;
192 	} else {
193 		((mblk_t *)mp)->b_wptr += buf_len;
194 	}
195 }
196 
197 /* ire_walk routine invoked for ip_ire_report_v6 for each IRE. */
198 static void
199 ire_report_ctable_v6(ire_t *ire, char *mp)
200 {
201 	char	buf1[INET6_ADDRSTRLEN];
202 	char	buf2[INET6_ADDRSTRLEN];
203 	char	buf3[INET6_ADDRSTRLEN];
204 	char	buf4[INET6_ADDRSTRLEN];
205 	uint_t	fo_pkt_count;
206 	uint_t	ib_pkt_count;
207 	int	ref;
208 	in6_addr_t gw_addr_v6;
209 	uint_t	print_len, buf_len;
210 
211 	if ((ire->ire_type & IRE_CACHETABLE) == 0)
212 		return;
213 	buf_len = ((mblk_t *)mp)->b_datap->db_lim - ((mblk_t *)mp)->b_wptr;
214 	if (buf_len <= 0)
215 		return;
216 
217 	/* Number of active references of this ire */
218 	ref = ire->ire_refcnt;
219 	/* "inbound" to a non local address is a forward */
220 	ib_pkt_count = ire->ire_ib_pkt_count;
221 	fo_pkt_count = 0;
222 	ASSERT(!(ire->ire_type & IRE_BROADCAST));
223 	if (ire->ire_type & IRE_LOCAL) {
224 		fo_pkt_count = ib_pkt_count;
225 		ib_pkt_count = 0;
226 	}
227 
228 	mutex_enter(&ire->ire_lock);
229 	gw_addr_v6 = ire->ire_gateway_addr_v6;
230 	mutex_exit(&ire->ire_lock);
231 
232 	print_len =  snprintf((char *)((mblk_t *)mp)->b_wptr, buf_len,
233 	    MI_COL_PTRFMT_STR MI_COL_PTRFMT_STR MI_COL_PTRFMT_STR "%5d "
234 	    "%05d %05ld %06ld %08d %03d %06d %09d %09d %06d %08d "
235 	    "%04d %08d %08d %d/%d/%d %s\n\t%s\n\t%s\n\t%s\n\t%s\n",
236 	    (void *)ire, (void *)ire->ire_rfq, (void *)ire->ire_stq,
237 	    (int)ire->ire_zoneid,
238 	    ire->ire_max_frag, ire->ire_uinfo.iulp_rtt,
239 	    ire->ire_uinfo.iulp_rtt_sd, ire->ire_uinfo.iulp_ssthresh, ref,
240 	    ire->ire_uinfo.iulp_rtomax,
241 	    (ire->ire_uinfo.iulp_tstamp_ok ? 1: 0),
242 	    (ire->ire_uinfo.iulp_wscale_ok ? 1: 0),
243 	    (ire->ire_uinfo.iulp_ecn_ok ? 1: 0),
244 	    (ire->ire_uinfo.iulp_pmtud_ok ? 1: 0),
245 	    ire->ire_uinfo.iulp_sack,
246 	    ire->ire_uinfo.iulp_spipe, ire->ire_uinfo.iulp_rpipe,
247 	    ib_pkt_count, ire->ire_ob_pkt_count,
248 	    fo_pkt_count, ip_nv_lookup(ire_nv_tbl, (int)ire->ire_type),
249 	    inet_ntop(AF_INET6, &ire->ire_addr_v6, buf1, sizeof (buf1)),
250 	    inet_ntop(AF_INET6, &ire->ire_mask_v6, buf2, sizeof (buf2)),
251 	    inet_ntop(AF_INET6, &ire->ire_src_addr_v6, buf3, sizeof (buf3)),
252 	    inet_ntop(AF_INET6, &gw_addr_v6, buf4, sizeof (buf4)));
253 	if (print_len < buf_len) {
254 		((mblk_t *)mp)->b_wptr += print_len;
255 	} else {
256 		((mblk_t *)mp)->b_wptr += buf_len;
257 	}
258 }
259 
260 
261 /*
262  * Initialize the ire that is specific to IPv6 part and call
263  * ire_init_common to finish it.
264  */
265 static ire_t *
266 ire_init_v6(ire_t *ire, const in6_addr_t *v6addr, const in6_addr_t *v6mask,
267     const in6_addr_t *v6src_addr, const in6_addr_t *v6gateway,
268     uint_t *max_fragp, queue_t *rfq, queue_t *stq, ushort_t type,
269     ipif_t *ipif, const in6_addr_t *v6cmask, uint32_t phandle,
270     uint32_t ihandle, uint_t flags, const iulp_t *ulp_info, tsol_gc_t *gc,
271     tsol_gcgrp_t *gcgrp, ip_stack_t *ipst)
272 {
273 
274 	/*
275 	 * Reject IRE security attribute creation/initialization
276 	 * if system is not running in Trusted mode.
277 	 */
278 	if ((gc != NULL || gcgrp != NULL) && !is_system_labeled())
279 		return (NULL);
280 
281 
282 	BUMP_IRE_STATS(ipst->ips_ire_stats_v6, ire_stats_alloced);
283 	ire->ire_addr_v6 = *v6addr;
284 
285 	if (v6src_addr != NULL)
286 		ire->ire_src_addr_v6 = *v6src_addr;
287 	if (v6mask != NULL) {
288 		ire->ire_mask_v6 = *v6mask;
289 		ire->ire_masklen = ip_mask_to_plen_v6(&ire->ire_mask_v6);
290 	}
291 	if (v6gateway != NULL)
292 		ire->ire_gateway_addr_v6 = *v6gateway;
293 
294 	if (type == IRE_CACHE && v6cmask != NULL)
295 		ire->ire_cmask_v6 = *v6cmask;
296 
297 	/*
298 	 * Multirouted packets need to have a fragment header added so that
299 	 * the receiver is able to discard duplicates according to their
300 	 * fragment identifier.
301 	 */
302 	if (type == IRE_CACHE && (flags & RTF_MULTIRT)) {
303 		ire->ire_frag_flag = IPH_FRAG_HDR;
304 	}
305 
306 	/* ire_init_common will free the mblks upon encountering any failure */
307 	if (!ire_init_common(ire, max_fragp, NULL, rfq, stq, type, ipif,
308 	    phandle, ihandle, flags, IPV6_VERSION, ulp_info, gc, gcgrp, ipst))
309 		return (NULL);
310 
311 	return (ire);
312 }
313 
314 /*
315  * Similar to ire_create_v6 except that it is called only when
316  * we want to allocate ire as an mblk e.g. we have a external
317  * resolver. Do we need this in IPv6 ?
318  *
319  * IPv6 initializes the ire_nce in ire_add_v6, which expects to
320  * find the ire_nce to be null when it is called. So, although
321  * we have a src_nce parameter (in the interest of matching up with
322  * the argument list of the v4 version), we ignore the src_nce
323  * argument here.
324  */
325 /* ARGSUSED */
326 ire_t *
327 ire_create_mp_v6(const in6_addr_t *v6addr, const in6_addr_t *v6mask,
328     const in6_addr_t *v6src_addr, const in6_addr_t *v6gateway,
329     nce_t *src_nce, queue_t *rfq, queue_t *stq, ushort_t type,
330     ipif_t *ipif, const in6_addr_t *v6cmask,
331     uint32_t phandle, uint32_t ihandle, uint_t flags, const iulp_t *ulp_info,
332     tsol_gc_t *gc, tsol_gcgrp_t *gcgrp, ip_stack_t *ipst)
333 {
334 	ire_t	*ire;
335 	ire_t	*ret_ire;
336 	mblk_t	*mp;
337 
338 	ASSERT(!IN6_IS_ADDR_V4MAPPED(v6addr));
339 
340 	/* Allocate the new IRE. */
341 	mp = allocb(sizeof (ire_t), BPRI_MED);
342 	if (mp == NULL) {
343 		ip1dbg(("ire_create_mp_v6: alloc failed\n"));
344 		return (NULL);
345 	}
346 
347 	ire = (ire_t *)mp->b_rptr;
348 	mp->b_wptr = (uchar_t *)&ire[1];
349 
350 	/* Start clean. */
351 	*ire = ire_null;
352 	ire->ire_mp = mp;
353 	mp->b_datap->db_type = IRE_DB_TYPE;
354 
355 	ret_ire = ire_init_v6(ire, v6addr, v6mask, v6src_addr, v6gateway,
356 	    NULL, rfq, stq, type, ipif, v6cmask, phandle,
357 	    ihandle, flags, ulp_info, gc, gcgrp, ipst);
358 
359 	if (ret_ire == NULL) {
360 		freeb(ire->ire_mp);
361 		return (NULL);
362 	}
363 	return (ire);
364 }
365 
366 /*
367  * ire_create_v6 is called to allocate and initialize a new IRE.
368  *
369  * NOTE : This is called as writer sometimes though not required
370  * by this function.
371  *
372  * See comments above ire_create_mp_v6() for the rationale behind the
373  * unused src_nce argument.
374  */
375 /* ARGSUSED */
376 ire_t *
377 ire_create_v6(const in6_addr_t *v6addr, const in6_addr_t *v6mask,
378     const in6_addr_t *v6src_addr, const in6_addr_t *v6gateway,
379     uint_t *max_fragp, nce_t *src_nce, queue_t *rfq, queue_t *stq,
380     ushort_t type, ipif_t *ipif, const in6_addr_t *v6cmask,
381     uint32_t phandle, uint32_t ihandle, uint_t flags, const iulp_t *ulp_info,
382     tsol_gc_t *gc, tsol_gcgrp_t *gcgrp, ip_stack_t *ipst)
383 {
384 	ire_t	*ire;
385 	ire_t	*ret_ire;
386 
387 	ASSERT(!IN6_IS_ADDR_V4MAPPED(v6addr));
388 
389 	ire = kmem_cache_alloc(ire_cache, KM_NOSLEEP);
390 	if (ire == NULL) {
391 		ip1dbg(("ire_create_v6: alloc failed\n"));
392 		return (NULL);
393 	}
394 	*ire = ire_null;
395 
396 	ret_ire = ire_init_v6(ire, v6addr, v6mask, v6src_addr, v6gateway,
397 	    max_fragp, rfq, stq, type, ipif, v6cmask, phandle,
398 	    ihandle, flags, ulp_info, gc, gcgrp, ipst);
399 
400 	if (ret_ire == NULL) {
401 		kmem_cache_free(ire_cache, ire);
402 		return (NULL);
403 	}
404 	ASSERT(ret_ire == ire);
405 	return (ire);
406 }
407 
408 /*
409  * Find an IRE_INTERFACE for the multicast group.
410  * Allows different routes for multicast addresses
411  * in the unicast routing table (akin to FF::0/8 but could be more specific)
412  * which point at different interfaces. This is used when IPV6_MULTICAST_IF
413  * isn't specified (when sending) and when IPV6_JOIN_GROUP doesn't
414  * specify the interface to join on.
415  *
416  * Supports link-local addresses by following the ipif/ill when recursing.
417  */
418 ire_t *
419 ire_lookup_multi_v6(const in6_addr_t *group, zoneid_t zoneid, ip_stack_t *ipst)
420 {
421 	ire_t	*ire;
422 	ipif_t	*ipif = NULL;
423 	int	match_flags = MATCH_IRE_TYPE;
424 	in6_addr_t gw_addr_v6;
425 
426 	ire = ire_ftable_lookup_v6(group, 0, 0, 0, NULL, NULL,
427 	    zoneid, 0, NULL, MATCH_IRE_DEFAULT, ipst);
428 
429 	/* We search a resolvable ire in case of multirouting. */
430 	if ((ire != NULL) && (ire->ire_flags & RTF_MULTIRT)) {
431 		ire_t *cire = NULL;
432 		/*
433 		 * If the route is not resolvable, the looked up ire
434 		 * may be changed here. In that case, ire_multirt_lookup()
435 		 * IRE_REFRELE the original ire and change it.
436 		 */
437 		(void) ire_multirt_lookup_v6(&cire, &ire, MULTIRT_CACHEGW,
438 		    NULL, ipst);
439 		if (cire != NULL)
440 			ire_refrele(cire);
441 	}
442 	if (ire == NULL)
443 		return (NULL);
444 	/*
445 	 * Make sure we follow ire_ipif.
446 	 *
447 	 * We need to determine the interface route through
448 	 * which the gateway will be reached. We don't really
449 	 * care which interface is picked if the interface is
450 	 * part of a group.
451 	 */
452 	if (ire->ire_ipif != NULL) {
453 		ipif = ire->ire_ipif;
454 		match_flags |= MATCH_IRE_ILL_GROUP;
455 	}
456 
457 	switch (ire->ire_type) {
458 	case IRE_DEFAULT:
459 	case IRE_PREFIX:
460 	case IRE_HOST:
461 		mutex_enter(&ire->ire_lock);
462 		gw_addr_v6 = ire->ire_gateway_addr_v6;
463 		mutex_exit(&ire->ire_lock);
464 		ire_refrele(ire);
465 		ire = ire_ftable_lookup_v6(&gw_addr_v6, 0, 0,
466 		    IRE_INTERFACE, ipif, NULL, zoneid, 0,
467 		    NULL, match_flags, ipst);
468 		return (ire);
469 	case IRE_IF_NORESOLVER:
470 	case IRE_IF_RESOLVER:
471 		return (ire);
472 	default:
473 		ire_refrele(ire);
474 		return (NULL);
475 	}
476 }
477 
478 /*
479  * Return any local address.  We use this to target ourselves
480  * when the src address was specified as 'default'.
481  * Preference for IRE_LOCAL entries.
482  */
483 ire_t *
484 ire_lookup_local_v6(zoneid_t zoneid, ip_stack_t *ipst)
485 {
486 	ire_t	*ire;
487 	irb_t	*irb;
488 	ire_t	*maybe = NULL;
489 	int i;
490 
491 	for (i = 0; i < ipst->ips_ip6_cache_table_size;  i++) {
492 		irb = &ipst->ips_ip_cache_table_v6[i];
493 		if (irb->irb_ire == NULL)
494 			continue;
495 		rw_enter(&irb->irb_lock, RW_READER);
496 		for (ire = irb->irb_ire; ire; ire = ire->ire_next) {
497 			if ((ire->ire_marks & IRE_MARK_CONDEMNED) ||
498 			    ire->ire_zoneid != zoneid &&
499 			    ire->ire_zoneid != ALL_ZONES)
500 				continue;
501 			switch (ire->ire_type) {
502 			case IRE_LOOPBACK:
503 				if (maybe == NULL) {
504 					IRE_REFHOLD(ire);
505 					maybe = ire;
506 				}
507 				break;
508 			case IRE_LOCAL:
509 				if (maybe != NULL) {
510 					ire_refrele(maybe);
511 				}
512 				IRE_REFHOLD(ire);
513 				rw_exit(&irb->irb_lock);
514 				return (ire);
515 			}
516 		}
517 		rw_exit(&irb->irb_lock);
518 	}
519 	return (maybe);
520 }
521 
522 /*
523  * This function takes a mask and returns number of bits set in the
524  * mask (the represented prefix length).  Assumes a contiguous mask.
525  */
526 int
527 ip_mask_to_plen_v6(const in6_addr_t *v6mask)
528 {
529 	int		bits;
530 	int		plen = IPV6_ABITS;
531 	int		i;
532 
533 	for (i = 3; i >= 0; i--) {
534 		if (v6mask->s6_addr32[i] == 0) {
535 			plen -= 32;
536 			continue;
537 		}
538 		bits = ffs(ntohl(v6mask->s6_addr32[i])) - 1;
539 		if (bits == 0)
540 			break;
541 		plen -= bits;
542 	}
543 
544 	return (plen);
545 }
546 
547 /*
548  * Convert a prefix length to the mask for that prefix.
549  * Returns the argument bitmask.
550  */
551 in6_addr_t *
552 ip_plen_to_mask_v6(uint_t plen, in6_addr_t *bitmask)
553 {
554 	uint32_t *ptr;
555 
556 	if (plen < 0 || plen > IPV6_ABITS)
557 		return (NULL);
558 	*bitmask = ipv6_all_zeros;
559 
560 	ptr = (uint32_t *)bitmask;
561 	while (plen > 32) {
562 		*ptr++ = 0xffffffffU;
563 		plen -= 32;
564 	}
565 	*ptr = htonl(0xffffffffU << (32 - plen));
566 	return (bitmask);
567 }
568 
569 /*
570  * Add a fully initialized IRE to an appropriate
571  * table based on ire_type.
572  *
573  * The forward table contains IRE_PREFIX/IRE_HOST and
574  * IRE_IF_RESOLVER/IRE_IF_NORESOLVER and IRE_DEFAULT.
575  *
576  * The cache table contains IRE_BROADCAST/IRE_LOCAL/IRE_LOOPBACK
577  * and IRE_CACHE.
578  *
579  * NOTE : This function is called as writer though not required
580  * by this function.
581  */
582 int
583 ire_add_v6(ire_t **ire_p, queue_t *q, mblk_t *mp, ipsq_func_t func)
584 {
585 	ire_t	*ire1;
586 	int	mask_table_index;
587 	irb_t	*irb_ptr;
588 	ire_t	**irep;
589 	int	flags;
590 	ire_t	*pire = NULL;
591 	ill_t	*stq_ill;
592 	boolean_t	ndp_g_lock_held = B_FALSE;
593 	ire_t	*ire = *ire_p;
594 	int	error;
595 	ip_stack_t	*ipst = ire->ire_ipst;
596 
597 	ASSERT(ire->ire_ipversion == IPV6_VERSION);
598 	ASSERT(ire->ire_mp == NULL); /* Calls should go through ire_add */
599 	ASSERT(ire->ire_nce == NULL);
600 
601 	/* Find the appropriate list head. */
602 	switch (ire->ire_type) {
603 	case IRE_HOST:
604 		ire->ire_mask_v6 = ipv6_all_ones;
605 		ire->ire_masklen = IPV6_ABITS;
606 		if ((ire->ire_flags & RTF_SETSRC) == 0)
607 			ire->ire_src_addr_v6 = ipv6_all_zeros;
608 		break;
609 	case IRE_CACHE:
610 	case IRE_LOCAL:
611 	case IRE_LOOPBACK:
612 		ire->ire_mask_v6 = ipv6_all_ones;
613 		ire->ire_masklen = IPV6_ABITS;
614 		break;
615 	case IRE_PREFIX:
616 		if ((ire->ire_flags & RTF_SETSRC) == 0)
617 			ire->ire_src_addr_v6 = ipv6_all_zeros;
618 		break;
619 	case IRE_DEFAULT:
620 		if ((ire->ire_flags & RTF_SETSRC) == 0)
621 			ire->ire_src_addr_v6 = ipv6_all_zeros;
622 		break;
623 	case IRE_IF_RESOLVER:
624 	case IRE_IF_NORESOLVER:
625 		break;
626 	default:
627 		printf("ire_add_v6: ire %p has unrecognized IRE type (%d)\n",
628 		    (void *)ire, ire->ire_type);
629 		ire_delete(ire);
630 		*ire_p = NULL;
631 		return (EINVAL);
632 	}
633 
634 	/* Make sure the address is properly masked. */
635 	V6_MASK_COPY(ire->ire_addr_v6, ire->ire_mask_v6, ire->ire_addr_v6);
636 
637 	if ((ire->ire_type & IRE_CACHETABLE) == 0) {
638 		/* IRE goes into Forward Table */
639 		mask_table_index = ip_mask_to_plen_v6(&ire->ire_mask_v6);
640 		if ((ipst->ips_ip_forwarding_table_v6[mask_table_index]) ==
641 		    NULL) {
642 			irb_t *ptr;
643 			int i;
644 
645 			ptr = (irb_t *)mi_zalloc((
646 			    ipst->ips_ip6_ftable_hash_size * sizeof (irb_t)));
647 			if (ptr == NULL) {
648 				ire_delete(ire);
649 				*ire_p = NULL;
650 				return (ENOMEM);
651 			}
652 			for (i = 0; i < ipst->ips_ip6_ftable_hash_size; i++) {
653 				rw_init(&ptr[i].irb_lock, NULL,
654 				    RW_DEFAULT, NULL);
655 			}
656 			mutex_enter(&ipst->ips_ire_ft_init_lock);
657 			if (ipst->ips_ip_forwarding_table_v6[
658 			    mask_table_index] == NULL) {
659 				ipst->ips_ip_forwarding_table_v6[
660 				    mask_table_index] = ptr;
661 				mutex_exit(&ipst->ips_ire_ft_init_lock);
662 			} else {
663 				/*
664 				 * Some other thread won the race in
665 				 * initializing the forwarding table at the
666 				 * same index.
667 				 */
668 				mutex_exit(&ipst->ips_ire_ft_init_lock);
669 				for (i = 0; i < ipst->ips_ip6_ftable_hash_size;
670 				    i++) {
671 					rw_destroy(&ptr[i].irb_lock);
672 				}
673 				mi_free(ptr);
674 			}
675 		}
676 		irb_ptr = &(ipst->ips_ip_forwarding_table_v6[mask_table_index][
677 		    IRE_ADDR_MASK_HASH_V6(ire->ire_addr_v6, ire->ire_mask_v6,
678 		    ipst->ips_ip6_ftable_hash_size)]);
679 	} else {
680 		irb_ptr = &(ipst->ips_ip_cache_table_v6[IRE_ADDR_HASH_V6(
681 		    ire->ire_addr_v6, ipst->ips_ip6_cache_table_size)]);
682 	}
683 	/*
684 	 * For xresolv interfaces (v6 interfaces with an external
685 	 * address resolver), ip_newroute_v6/ip_newroute_ipif_v6
686 	 * are unable to prevent the deletion of the interface route
687 	 * while adding an IRE_CACHE for an on-link destination
688 	 * in the IRE_IF_RESOLVER case, since the ire has to go to
689 	 * the external resolver and return. We can't do a REFHOLD on the
690 	 * associated interface ire for fear of the message being freed
691 	 * if the external resolver can't resolve the address.
692 	 * Here we look up the interface ire in the forwarding table
693 	 * and make sure that the interface route has not been deleted.
694 	 */
695 	if (ire->ire_type == IRE_CACHE &&
696 	    IN6_IS_ADDR_UNSPECIFIED(&ire->ire_gateway_addr_v6) &&
697 	    (((ill_t *)ire->ire_stq->q_ptr)->ill_net_type == IRE_IF_RESOLVER) &&
698 	    (((ill_t *)ire->ire_stq->q_ptr)->ill_flags & ILLF_XRESOLV)) {
699 
700 		pire = ire_ihandle_lookup_onlink_v6(ire);
701 		if (pire == NULL) {
702 			ire_delete(ire);
703 			*ire_p = NULL;
704 			return (EINVAL);
705 		}
706 		/* Prevent pire from getting deleted */
707 		IRB_REFHOLD(pire->ire_bucket);
708 		/* Has it been removed already? */
709 		if (pire->ire_marks & IRE_MARK_CONDEMNED) {
710 			IRB_REFRELE(pire->ire_bucket);
711 			ire_refrele(pire);
712 			ire_delete(ire);
713 			*ire_p = NULL;
714 			return (EINVAL);
715 		}
716 	}
717 
718 	flags = (MATCH_IRE_MASK | MATCH_IRE_TYPE | MATCH_IRE_GW);
719 	/*
720 	 * For IRE_CACHES, MATCH_IRE_IPIF is not enough to check
721 	 * for duplicates because :
722 	 *
723 	 * 1) ire_ipif->ipif_ill and ire_stq->q_ptr could be
724 	 *    pointing at different ills. A real duplicate is
725 	 *    a match on both ire_ipif and ire_stq.
726 	 *
727 	 * 2) We could have multiple packets trying to create
728 	 *    an IRE_CACHE for the same ill.
729 	 *
730 	 * Moreover, IPIF_NOFAILOVER and IPV6_BOUND_PIF endpoints wants
731 	 * to go out on a particular ill. Rather than looking at the
732 	 * packet, we depend on the above for MATCH_IRE_ILL here.
733 	 *
734 	 * Unlike IPv4, MATCH_IRE_IPIF is needed here as we could have
735 	 * multiple IRE_CACHES for an ill for the same destination
736 	 * with various scoped addresses i.e represented by ipifs.
737 	 *
738 	 * MATCH_IRE_ILL is done implicitly below for IRE_CACHES.
739 	 */
740 	if (ire->ire_ipif != NULL)
741 		flags |= MATCH_IRE_IPIF;
742 	/*
743 	 * If we are creating hidden ires, make sure we search on
744 	 * this ill (MATCH_IRE_ILL) and a hidden ire, while we are
745 	 * searching for duplicates below. Otherwise we could
746 	 * potentially find an IRE on some other interface
747 	 * and it may not be a IRE marked with IRE_MARK_HIDDEN. We
748 	 * shouldn't do this as this will lead to an infinite loop as
749 	 * eventually we need an hidden ire for this packet to go
750 	 * out. MATCH_IRE_ILL is already marked above.
751 	 */
752 	if (ire->ire_marks & IRE_MARK_HIDDEN) {
753 		ASSERT(ire->ire_type == IRE_CACHE);
754 		flags |= MATCH_IRE_MARK_HIDDEN;
755 	}
756 
757 	/*
758 	 * Start the atomic add of the ire. Grab the ill locks,
759 	 * ill_g_usesrc_lock and the bucket lock. Check for condemned.
760 	 * To avoid lock order problems, get the ndp6.ndp_g_lock now itself.
761 	 */
762 	if (ire->ire_type == IRE_CACHE) {
763 		mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
764 		ndp_g_lock_held = B_TRUE;
765 	}
766 
767 	/*
768 	 * If ipif or ill is changing ire_atomic_start() may queue the
769 	 * request and return EINPROGRESS.
770 	 */
771 
772 	error = ire_atomic_start(irb_ptr, ire, q, mp, func);
773 	if (error != 0) {
774 		if (ndp_g_lock_held)
775 			mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
776 		/*
777 		 * We don't know whether it is a valid ipif or not.
778 		 * So, set it to NULL. This assumes that the ire has not added
779 		 * a reference to the ipif.
780 		 */
781 		ire->ire_ipif = NULL;
782 		ire_delete(ire);
783 		if (pire != NULL) {
784 			IRB_REFRELE(pire->ire_bucket);
785 			ire_refrele(pire);
786 		}
787 		*ire_p = NULL;
788 		return (error);
789 	}
790 	/*
791 	 * To avoid creating ires having stale values for the ire_max_frag
792 	 * we get the latest value atomically here. For more details
793 	 * see the block comment in ip_sioctl_mtu and in DL_NOTE_SDU_CHANGE
794 	 * in ip_rput_dlpi_writer
795 	 */
796 	if (ire->ire_max_fragp == NULL) {
797 		if (IN6_IS_ADDR_MULTICAST(&ire->ire_addr_v6))
798 			ire->ire_max_frag = ire->ire_ipif->ipif_mtu;
799 		else
800 			ire->ire_max_frag = pire->ire_max_frag;
801 	} else {
802 		uint_t  max_frag;
803 
804 		max_frag = *ire->ire_max_fragp;
805 		ire->ire_max_fragp = NULL;
806 		ire->ire_max_frag = max_frag;
807 	}
808 
809 	/*
810 	 * Atomically check for duplicate and insert in the table.
811 	 */
812 	for (ire1 = irb_ptr->irb_ire; ire1 != NULL; ire1 = ire1->ire_next) {
813 		if (ire1->ire_marks & IRE_MARK_CONDEMNED)
814 			continue;
815 
816 		if (ire->ire_type == IRE_CACHE) {
817 			/*
818 			 * We do MATCH_IRE_ILL implicitly here for IRE_CACHES.
819 			 * As ire_ipif and ire_stq could point to two
820 			 * different ills, we can't pass just ire_ipif to
821 			 * ire_match_args and get a match on both ills.
822 			 * This is just needed for duplicate checks here and
823 			 * so we don't add an extra argument to
824 			 * ire_match_args for this. Do it locally.
825 			 *
826 			 * NOTE : Currently there is no part of the code
827 			 * that asks for both MATH_IRE_IPIF and MATCH_IRE_ILL
828 			 * match for IRE_CACHEs. Thus we don't want to
829 			 * extend the arguments to ire_match_args_v6.
830 			 */
831 			if (ire1->ire_stq != ire->ire_stq)
832 				continue;
833 			/*
834 			 * Multiroute IRE_CACHEs for a given destination can
835 			 * have the same ire_ipif, typically if their source
836 			 * address is forced using RTF_SETSRC, and the same
837 			 * send-to queue. We differentiate them using the parent
838 			 * handle.
839 			 */
840 			if ((ire1->ire_flags & RTF_MULTIRT) &&
841 			    (ire->ire_flags & RTF_MULTIRT) &&
842 			    (ire1->ire_phandle != ire->ire_phandle))
843 				continue;
844 		}
845 		if (ire1->ire_zoneid != ire->ire_zoneid)
846 			continue;
847 		if (ire_match_args_v6(ire1, &ire->ire_addr_v6,
848 		    &ire->ire_mask_v6, &ire->ire_gateway_addr_v6,
849 		    ire->ire_type, ire->ire_ipif, ire->ire_zoneid, 0, NULL,
850 		    flags)) {
851 			/*
852 			 * Return the old ire after doing a REFHOLD.
853 			 * As most of the callers continue to use the IRE
854 			 * after adding, we return a held ire. This will
855 			 * avoid a lookup in the caller again. If the callers
856 			 * don't want to use it, they need to do a REFRELE.
857 			 */
858 			ip1dbg(("found dup ire existing %p new %p",
859 			    (void *)ire1, (void *)ire));
860 			IRE_REFHOLD(ire1);
861 			if (ndp_g_lock_held)
862 				mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
863 			ire_atomic_end(irb_ptr, ire);
864 			ire_delete(ire);
865 			if (pire != NULL) {
866 				/*
867 				 * Assert that it is
868 				 * not yet removed from the list.
869 				 */
870 				ASSERT(pire->ire_ptpn != NULL);
871 				IRB_REFRELE(pire->ire_bucket);
872 				ire_refrele(pire);
873 			}
874 			*ire_p = ire1;
875 			return (0);
876 		}
877 	}
878 	if (ire->ire_type == IRE_CACHE) {
879 		in6_addr_t gw_addr_v6;
880 		ill_t	*ill = ire_to_ill(ire);
881 		char	buf[INET6_ADDRSTRLEN];
882 		nce_t	*nce;
883 
884 		/*
885 		 * All IRE_CACHE types must have a nce.  If this is
886 		 * not the case the entry will not be added. We need
887 		 * to make sure that if somebody deletes the nce
888 		 * after we looked up, they will find this ire and
889 		 * delete the ire. To delete this ire one needs the
890 		 * bucket lock which we are still holding here. So,
891 		 * even if the nce gets deleted after we looked up,
892 		 * this ire  will get deleted.
893 		 *
894 		 * NOTE : Don't need the ire_lock for accessing
895 		 * ire_gateway_addr_v6 as it is appearing first
896 		 * time on the list and rts_setgwr_v6 could not
897 		 * be changing this.
898 		 */
899 		gw_addr_v6 = ire->ire_gateway_addr_v6;
900 		if (IN6_IS_ADDR_UNSPECIFIED(&gw_addr_v6)) {
901 			nce = ndp_lookup_v6(ill, &ire->ire_addr_v6, B_TRUE);
902 		} else {
903 			nce = ndp_lookup_v6(ill, &gw_addr_v6, B_TRUE);
904 		}
905 		if (nce == NULL)
906 			goto failed;
907 
908 		/* Pair of refhold, refrele just to get the tracing right */
909 		NCE_REFHOLD_TO_REFHOLD_NOTR(nce);
910 		/*
911 		 * Atomically make sure that new IREs don't point
912 		 * to an NCE that is logically deleted (CONDEMNED).
913 		 * ndp_delete() first marks the NCE CONDEMNED.
914 		 * This ensures that the nce_refcnt won't increase
915 		 * due to new nce_lookups or due to addition of new IREs
916 		 * pointing to this NCE. Then ndp_delete() cleans up
917 		 * existing references. If we don't do it atomically here,
918 		 * ndp_delete() -> nce_ire_delete() will not be able to
919 		 * clean up the IRE list completely, and the nce_refcnt
920 		 * won't go down to zero.
921 		 */
922 		mutex_enter(&nce->nce_lock);
923 		if (ill->ill_flags & ILLF_XRESOLV) {
924 			/*
925 			 * If we used an external resolver, we may not
926 			 * have gone through neighbor discovery to get here.
927 			 * Must update the nce_state before the next check.
928 			 */
929 			if (nce->nce_state == ND_INCOMPLETE)
930 				nce->nce_state = ND_REACHABLE;
931 		}
932 		if (nce->nce_state == ND_INCOMPLETE ||
933 		    (nce->nce_flags & NCE_F_CONDEMNED) ||
934 		    (nce->nce_state == ND_UNREACHABLE)) {
935 failed:
936 			if (ndp_g_lock_held)
937 				mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
938 			if (nce != NULL)
939 				mutex_exit(&nce->nce_lock);
940 			ire_atomic_end(irb_ptr, ire);
941 			ip1dbg(("ire_add_v6: No nce for dst %s \n",
942 			    inet_ntop(AF_INET6, &ire->ire_addr_v6,
943 			    buf, sizeof (buf))));
944 			ire_delete(ire);
945 			if (pire != NULL) {
946 				/*
947 				 * Assert that it is
948 				 * not yet removed from the list.
949 				 */
950 				ASSERT(pire->ire_ptpn != NULL);
951 				IRB_REFRELE(pire->ire_bucket);
952 				ire_refrele(pire);
953 			}
954 			if (nce != NULL)
955 				NCE_REFRELE_NOTR(nce);
956 			*ire_p = NULL;
957 			return (EINVAL);
958 		} else {
959 			ire->ire_nce = nce;
960 		}
961 		mutex_exit(&nce->nce_lock);
962 	}
963 	/*
964 	 * Find the first entry that matches ire_addr - provides
965 	 * tail insertion. *irep will be null if no match.
966 	 */
967 	irep = (ire_t **)irb_ptr;
968 	while ((ire1 = *irep) != NULL &&
969 	    !IN6_ARE_ADDR_EQUAL(&ire->ire_addr_v6, &ire1->ire_addr_v6))
970 		irep = &ire1->ire_next;
971 	ASSERT(!(ire->ire_type & IRE_BROADCAST));
972 
973 	if (*irep != NULL) {
974 		/*
975 		 * Find the last ire which matches ire_addr_v6.
976 		 * Needed to do tail insertion among entries with the same
977 		 * ire_addr_v6.
978 		 */
979 		while (IN6_ARE_ADDR_EQUAL(&ire->ire_addr_v6,
980 		    &ire1->ire_addr_v6)) {
981 			irep = &ire1->ire_next;
982 			ire1 = *irep;
983 			if (ire1 == NULL)
984 				break;
985 		}
986 	}
987 
988 	if (ire->ire_type == IRE_DEFAULT) {
989 		/*
990 		 * We keep a count of default gateways which is used when
991 		 * assigning them as routes.
992 		 */
993 		ipst->ips_ipv6_ire_default_count++;
994 		ASSERT(ipst->ips_ipv6_ire_default_count != 0); /* Wraparound */
995 	}
996 	/* Insert at *irep */
997 	ire1 = *irep;
998 	if (ire1 != NULL)
999 		ire1->ire_ptpn = &ire->ire_next;
1000 	ire->ire_next = ire1;
1001 	/* Link the new one in. */
1002 	ire->ire_ptpn = irep;
1003 	/*
1004 	 * ire_walk routines de-reference ire_next without holding
1005 	 * a lock. Before we point to the new ire, we want to make
1006 	 * sure the store that sets the ire_next of the new ire
1007 	 * reaches global visibility, so that ire_walk routines
1008 	 * don't see a truncated list of ires i.e if the ire_next
1009 	 * of the new ire gets set after we do "*irep = ire" due
1010 	 * to re-ordering, the ire_walk thread will see a NULL
1011 	 * once it accesses the ire_next of the new ire.
1012 	 * membar_producer() makes sure that the following store
1013 	 * happens *after* all of the above stores.
1014 	 */
1015 	membar_producer();
1016 	*irep = ire;
1017 	ire->ire_bucket = irb_ptr;
1018 	/*
1019 	 * We return a bumped up IRE above. Keep it symmetrical
1020 	 * so that the callers will always have to release. This
1021 	 * helps the callers of this function because they continue
1022 	 * to use the IRE after adding and hence they don't have to
1023 	 * lookup again after we return the IRE.
1024 	 *
1025 	 * NOTE : We don't have to use atomics as this is appearing
1026 	 * in the list for the first time and no one else can bump
1027 	 * up the reference count on this yet.
1028 	 */
1029 	IRE_REFHOLD_LOCKED(ire);
1030 	BUMP_IRE_STATS(ipst->ips_ire_stats_v6, ire_stats_inserted);
1031 	irb_ptr->irb_ire_cnt++;
1032 	if (ire->ire_marks & IRE_MARK_TEMPORARY)
1033 		irb_ptr->irb_tmp_ire_cnt++;
1034 
1035 	if (ire->ire_ipif != NULL) {
1036 		ire->ire_ipif->ipif_ire_cnt++;
1037 		if (ire->ire_stq != NULL) {
1038 			stq_ill = (ill_t *)ire->ire_stq->q_ptr;
1039 			stq_ill->ill_ire_cnt++;
1040 		}
1041 	} else {
1042 		ASSERT(ire->ire_stq == NULL);
1043 	}
1044 
1045 	if (ndp_g_lock_held)
1046 		mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
1047 	ire_atomic_end(irb_ptr, ire);
1048 
1049 	if (pire != NULL) {
1050 		/* Assert that it is not removed from the list yet */
1051 		ASSERT(pire->ire_ptpn != NULL);
1052 		IRB_REFRELE(pire->ire_bucket);
1053 		ire_refrele(pire);
1054 	}
1055 
1056 	if (ire->ire_type != IRE_CACHE) {
1057 		/*
1058 		 * For IREs with a host mask, see if there is an entry
1059 		 * in the cache. If there is one, flush the whole cache, as
1060 		 * there might be multiple entries due to RTF_MULTIRT (CGTP).
1061 		 * If no entry is found, then there is no need to flush the
1062 		 * cache.
1063 		 */
1064 
1065 		if (ip_mask_to_plen_v6(&ire->ire_mask_v6) == IPV6_ABITS) {
1066 			ire_t *lire;
1067 			lire = ire_ctable_lookup_v6(&ire->ire_addr_v6, NULL,
1068 			    IRE_CACHE, NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE,
1069 			    ipst);
1070 			if (lire != NULL) {
1071 				ire_refrele(lire);
1072 				ire_flush_cache_v6(ire, IRE_FLUSH_ADD);
1073 			}
1074 		} else {
1075 			ire_flush_cache_v6(ire, IRE_FLUSH_ADD);
1076 		}
1077 	}
1078 
1079 	*ire_p = ire;
1080 	return (0);
1081 }
1082 
1083 /*
1084  * Search for all HOST REDIRECT routes that are
1085  * pointing at the specified gateway and
1086  * delete them. This routine is called only
1087  * when a default gateway is going away.
1088  */
1089 static void
1090 ire_delete_host_redirects_v6(const in6_addr_t *gateway, ip_stack_t *ipst)
1091 {
1092 	irb_t *irb_ptr;
1093 	irb_t *irb;
1094 	ire_t *ire;
1095 	in6_addr_t gw_addr_v6;
1096 	int i;
1097 
1098 	/* get the hash table for HOST routes */
1099 	irb_ptr = ipst->ips_ip_forwarding_table_v6[(IP6_MASK_TABLE_SIZE - 1)];
1100 	if (irb_ptr == NULL)
1101 		return;
1102 	for (i = 0; (i < ipst->ips_ip6_ftable_hash_size); i++) {
1103 		irb = &irb_ptr[i];
1104 		IRB_REFHOLD(irb);
1105 		for (ire = irb->irb_ire; ire != NULL; ire = ire->ire_next) {
1106 			if (!(ire->ire_flags & RTF_DYNAMIC))
1107 				continue;
1108 			mutex_enter(&ire->ire_lock);
1109 			gw_addr_v6 = ire->ire_gateway_addr_v6;
1110 			mutex_exit(&ire->ire_lock);
1111 			if (IN6_ARE_ADDR_EQUAL(&gw_addr_v6, gateway))
1112 				ire_delete(ire);
1113 		}
1114 		IRB_REFRELE(irb);
1115 	}
1116 }
1117 
1118 /*
1119  * Delete all the cache entries with this 'addr'. This is the IPv6 counterpart
1120  * of ip_ire_clookup_and_delete. The difference being this function does not
1121  * return any value. IPv6 processing of a gratuitous ARP, as it stands, is
1122  * different than IPv4 in that, regardless of the presence of a cache entry
1123  * for this address, an ire_walk_v6 is done. Another difference is that unlike
1124  * in the case of IPv4 this does not take an ipif_t argument, since it is only
1125  * called by ip_arp_news and the match is always only on the address.
1126  */
1127 void
1128 ip_ire_clookup_and_delete_v6(const in6_addr_t *addr, ip_stack_t *ipst)
1129 {
1130 	irb_t		*irb;
1131 	ire_t		*cire;
1132 	boolean_t	found = B_FALSE;
1133 
1134 	irb = &ipst->ips_ip_cache_table_v6[IRE_ADDR_HASH_V6(*addr,
1135 	    ipst->ips_ip6_cache_table_size)];
1136 	IRB_REFHOLD(irb);
1137 	for (cire = irb->irb_ire; cire != NULL; cire = cire->ire_next) {
1138 		if (cire->ire_marks & IRE_MARK_CONDEMNED)
1139 			continue;
1140 		if (IN6_ARE_ADDR_EQUAL(&cire->ire_addr_v6, addr)) {
1141 
1142 			/* This signifies start of a match */
1143 			if (!found)
1144 				found = B_TRUE;
1145 			if (cire->ire_type == IRE_CACHE) {
1146 				if (cire->ire_nce != NULL)
1147 					ndp_delete(cire->ire_nce);
1148 				ire_delete_v6(cire);
1149 			}
1150 		/* End of the match */
1151 		} else if (found)
1152 			break;
1153 	}
1154 	IRB_REFRELE(irb);
1155 }
1156 
1157 /*
1158  * Delete the specified IRE.
1159  * All calls should use ire_delete().
1160  * Sometimes called as writer though not required by this function.
1161  *
1162  * NOTE : This function is called only if the ire was added
1163  * in the list.
1164  */
1165 void
1166 ire_delete_v6(ire_t *ire)
1167 {
1168 	in6_addr_t gw_addr_v6;
1169 	ip_stack_t	*ipst = ire->ire_ipst;
1170 
1171 	ASSERT(ire->ire_refcnt >= 1);
1172 	ASSERT(ire->ire_ipversion == IPV6_VERSION);
1173 
1174 	if (ire->ire_type != IRE_CACHE)
1175 		ire_flush_cache_v6(ire, IRE_FLUSH_DELETE);
1176 	if (ire->ire_type == IRE_DEFAULT) {
1177 		/*
1178 		 * when a default gateway is going away
1179 		 * delete all the host redirects pointing at that
1180 		 * gateway.
1181 		 */
1182 		mutex_enter(&ire->ire_lock);
1183 		gw_addr_v6 = ire->ire_gateway_addr_v6;
1184 		mutex_exit(&ire->ire_lock);
1185 		ire_delete_host_redirects_v6(&gw_addr_v6, ipst);
1186 	}
1187 }
1188 
1189 /*
1190  * ire_walk routine to delete all IRE_CACHE and IRE_HOST type redirect
1191  * entries.
1192  */
1193 /*ARGSUSED1*/
1194 void
1195 ire_delete_cache_v6(ire_t *ire, char *arg)
1196 {
1197 	char    addrstr1[INET6_ADDRSTRLEN];
1198 	char    addrstr2[INET6_ADDRSTRLEN];
1199 
1200 	if ((ire->ire_type & IRE_CACHE) ||
1201 	    (ire->ire_flags & RTF_DYNAMIC)) {
1202 		ip1dbg(("ire_delete_cache_v6: deleted %s type %d through %s\n",
1203 		    inet_ntop(AF_INET6, &ire->ire_addr_v6,
1204 		    addrstr1, sizeof (addrstr1)),
1205 		    ire->ire_type,
1206 		    inet_ntop(AF_INET6, &ire->ire_gateway_addr_v6,
1207 		    addrstr2, sizeof (addrstr2))));
1208 		ire_delete(ire);
1209 	}
1210 
1211 }
1212 
1213 /*
1214  * ire_walk routine to delete all IRE_CACHE/IRE_HOST type redirect entries
1215  * that have a given gateway address.
1216  */
1217 void
1218 ire_delete_cache_gw_v6(ire_t *ire, char *addr)
1219 {
1220 	in6_addr_t	*gw_addr = (in6_addr_t *)addr;
1221 	char		buf1[INET6_ADDRSTRLEN];
1222 	char		buf2[INET6_ADDRSTRLEN];
1223 	in6_addr_t	ire_gw_addr_v6;
1224 
1225 	if (!(ire->ire_type & IRE_CACHE) &&
1226 	    !(ire->ire_flags & RTF_DYNAMIC))
1227 		return;
1228 
1229 	mutex_enter(&ire->ire_lock);
1230 	ire_gw_addr_v6 = ire->ire_gateway_addr_v6;
1231 	mutex_exit(&ire->ire_lock);
1232 
1233 	if (IN6_ARE_ADDR_EQUAL(&ire_gw_addr_v6, gw_addr)) {
1234 		ip1dbg(("ire_delete_cache_gw_v6: deleted %s type %d to %s\n",
1235 		    inet_ntop(AF_INET6, &ire->ire_src_addr_v6,
1236 		    buf1, sizeof (buf1)),
1237 		    ire->ire_type,
1238 		    inet_ntop(AF_INET6, &ire_gw_addr_v6,
1239 		    buf2, sizeof (buf2))));
1240 		ire_delete(ire);
1241 	}
1242 }
1243 
1244 /*
1245  * Remove all IRE_CACHE entries that match
1246  * the ire specified.  (Sometimes called
1247  * as writer though not required by this function.)
1248  *
1249  * The flag argument indicates if the
1250  * flush request is due to addition
1251  * of new route (IRE_FLUSH_ADD) or deletion of old
1252  * route (IRE_FLUSH_DELETE).
1253  *
1254  * This routine takes only the IREs from the forwarding
1255  * table and flushes the corresponding entries from
1256  * the cache table.
1257  *
1258  * When flushing due to the deletion of an old route, it
1259  * just checks the cache handles (ire_phandle and ire_ihandle) and
1260  * deletes the ones that match.
1261  *
1262  * When flushing due to the creation of a new route, it checks
1263  * if a cache entry's address matches the one in the IRE and
1264  * that the cache entry's parent has a less specific mask than the
1265  * one in IRE. The destination of such a cache entry could be the
1266  * gateway for other cache entries, so we need to flush those as
1267  * well by looking for gateway addresses matching the IRE's address.
1268  */
1269 void
1270 ire_flush_cache_v6(ire_t *ire, int flag)
1271 {
1272 	int i;
1273 	ire_t *cire;
1274 	irb_t *irb;
1275 	ip_stack_t	*ipst = ire->ire_ipst;
1276 
1277 	if (ire->ire_type & IRE_CACHE)
1278 		return;
1279 
1280 	/*
1281 	 * If a default is just created, there is no point
1282 	 * in going through the cache, as there will not be any
1283 	 * cached ires.
1284 	 */
1285 	if (ire->ire_type == IRE_DEFAULT && flag == IRE_FLUSH_ADD)
1286 		return;
1287 	if (flag == IRE_FLUSH_ADD) {
1288 		/*
1289 		 * This selective flush is
1290 		 * due to the addition of
1291 		 * new IRE.
1292 		 */
1293 		for (i = 0; i < ipst->ips_ip6_cache_table_size; i++) {
1294 			irb = &ipst->ips_ip_cache_table_v6[i];
1295 			if ((cire = irb->irb_ire) == NULL)
1296 				continue;
1297 			IRB_REFHOLD(irb);
1298 			for (cire = irb->irb_ire; cire != NULL;
1299 			    cire = cire->ire_next) {
1300 				if (cire->ire_type != IRE_CACHE)
1301 					continue;
1302 				/*
1303 				 * If 'cire' belongs to the same subnet
1304 				 * as the new ire being added, and 'cire'
1305 				 * is derived from a prefix that is less
1306 				 * specific than the new ire being added,
1307 				 * we need to flush 'cire'; for instance,
1308 				 * when a new interface comes up.
1309 				 */
1310 				if ((V6_MASK_EQ_2(cire->ire_addr_v6,
1311 				    ire->ire_mask_v6, ire->ire_addr_v6) &&
1312 				    (ip_mask_to_plen_v6(&cire->ire_cmask_v6) <=
1313 				    ire->ire_masklen))) {
1314 					ire_delete(cire);
1315 					continue;
1316 				}
1317 				/*
1318 				 * This is the case when the ire_gateway_addr
1319 				 * of 'cire' belongs to the same subnet as
1320 				 * the new ire being added.
1321 				 * Flushing such ires is sometimes required to
1322 				 * avoid misrouting: say we have a machine with
1323 				 * two interfaces (I1 and I2), a default router
1324 				 * R on the I1 subnet, and a host route to an
1325 				 * off-link destination D with a gateway G on
1326 				 * the I2 subnet.
1327 				 * Under normal operation, we will have an
1328 				 * on-link cache entry for G and an off-link
1329 				 * cache entry for D with G as ire_gateway_addr,
1330 				 * traffic to D will reach its destination
1331 				 * through gateway G.
1332 				 * If the administrator does 'ifconfig I2 down',
1333 				 * the cache entries for D and G will be
1334 				 * flushed. However, G will now be resolved as
1335 				 * an off-link destination using R (the default
1336 				 * router) as gateway. Then D will also be
1337 				 * resolved as an off-link destination using G
1338 				 * as gateway - this behavior is due to
1339 				 * compatibility reasons, see comment in
1340 				 * ire_ihandle_lookup_offlink(). Traffic to D
1341 				 * will go to the router R and probably won't
1342 				 * reach the destination.
1343 				 * The administrator then does 'ifconfig I2 up'.
1344 				 * Since G is on the I2 subnet, this routine
1345 				 * will flush its cache entry. It must also
1346 				 * flush the cache entry for D, otherwise
1347 				 * traffic will stay misrouted until the IRE
1348 				 * times out.
1349 				 */
1350 				if (V6_MASK_EQ_2(cire->ire_gateway_addr_v6,
1351 				    ire->ire_mask_v6, ire->ire_addr_v6)) {
1352 					ire_delete(cire);
1353 					continue;
1354 				}
1355 			}
1356 			IRB_REFRELE(irb);
1357 		}
1358 	} else {
1359 		/*
1360 		 * delete the cache entries based on
1361 		 * handle in the IRE as this IRE is
1362 		 * being deleted/changed.
1363 		 */
1364 		for (i = 0; i < ipst->ips_ip6_cache_table_size; i++) {
1365 			irb = &ipst->ips_ip_cache_table_v6[i];
1366 			if ((cire = irb->irb_ire) == NULL)
1367 				continue;
1368 			IRB_REFHOLD(irb);
1369 			for (cire = irb->irb_ire; cire != NULL;
1370 			    cire = cire->ire_next) {
1371 				if (cire->ire_type != IRE_CACHE)
1372 					continue;
1373 				if ((cire->ire_phandle == 0 ||
1374 				    cire->ire_phandle != ire->ire_phandle) &&
1375 				    (cire->ire_ihandle == 0 ||
1376 				    cire->ire_ihandle != ire->ire_ihandle))
1377 					continue;
1378 				ire_delete(cire);
1379 			}
1380 			IRB_REFRELE(irb);
1381 		}
1382 	}
1383 }
1384 
1385 /*
1386  * Matches the arguments passed with the values in the ire.
1387  *
1388  * Note: for match types that match using "ipif" passed in, ipif
1389  * must be checked for non-NULL before calling this routine.
1390  */
1391 static boolean_t
1392 ire_match_args_v6(ire_t *ire, const in6_addr_t *addr, const in6_addr_t *mask,
1393     const in6_addr_t *gateway, int type, const ipif_t *ipif, zoneid_t zoneid,
1394     uint32_t ihandle, const ts_label_t *tsl, int match_flags)
1395 {
1396 	in6_addr_t masked_addr;
1397 	in6_addr_t gw_addr_v6;
1398 	ill_t *ire_ill = NULL, *dst_ill;
1399 	ill_t *ipif_ill = NULL;
1400 	ill_group_t *ire_ill_group = NULL;
1401 	ill_group_t *ipif_ill_group = NULL;
1402 	ipif_t	*src_ipif;
1403 
1404 	ASSERT(ire->ire_ipversion == IPV6_VERSION);
1405 	ASSERT(addr != NULL);
1406 	ASSERT(mask != NULL);
1407 	ASSERT((!(match_flags & MATCH_IRE_GW)) || gateway != NULL);
1408 	ASSERT((!(match_flags & (MATCH_IRE_ILL|MATCH_IRE_ILL_GROUP))) ||
1409 	    (ipif != NULL && ipif->ipif_isv6));
1410 	ASSERT(!(match_flags & MATCH_IRE_WQ));
1411 
1412 	/*
1413 	 * HIDDEN cache entries have to be looked up specifically with
1414 	 * MATCH_IRE_MARK_HIDDEN. MATCH_IRE_MARK_HIDDEN is usually set
1415 	 * when the interface is FAILED or INACTIVE. In that case,
1416 	 * any IRE_CACHES that exists should be marked with
1417 	 * IRE_MARK_HIDDEN. So, we don't really need to match below
1418 	 * for IRE_MARK_HIDDEN. But we do so for consistency.
1419 	 */
1420 	if (!(match_flags & MATCH_IRE_MARK_HIDDEN) &&
1421 	    (ire->ire_marks & IRE_MARK_HIDDEN))
1422 		return (B_FALSE);
1423 
1424 	if (zoneid != ALL_ZONES && zoneid != ire->ire_zoneid &&
1425 	    ire->ire_zoneid != ALL_ZONES) {
1426 		/*
1427 		 * If MATCH_IRE_ZONEONLY has been set and the supplied zoneid is
1428 		 * valid and does not match that of ire_zoneid, a failure to
1429 		 * match is reported at this point. Otherwise, since some IREs
1430 		 * that are available in the global zone can be used in local
1431 		 * zones, additional checks need to be performed:
1432 		 *
1433 		 *	IRE_CACHE and IRE_LOOPBACK entries should
1434 		 *	never be matched in this situation.
1435 		 *
1436 		 *	IRE entries that have an interface associated with them
1437 		 *	should in general not match unless they are an IRE_LOCAL
1438 		 *	or in the case when MATCH_IRE_DEFAULT has been set in
1439 		 *	the caller.  In the case of the former, checking of the
1440 		 *	other fields supplied should take place.
1441 		 *
1442 		 *	In the case where MATCH_IRE_DEFAULT has been set,
1443 		 *	all of the ipif's associated with the IRE's ill are
1444 		 *	checked to see if there is a matching zoneid.  If any
1445 		 *	one ipif has a matching zoneid, this IRE is a
1446 		 *	potential candidate so checking of the other fields
1447 		 *	takes place.
1448 		 *
1449 		 *	In the case where the IRE_INTERFACE has a usable source
1450 		 *	address (indicated by ill_usesrc_ifindex) in the
1451 		 *	correct zone then it's permitted to return this IRE
1452 		 */
1453 		if (match_flags & MATCH_IRE_ZONEONLY)
1454 			return (B_FALSE);
1455 		if (ire->ire_type & (IRE_CACHE | IRE_LOOPBACK))
1456 			return (B_FALSE);
1457 		/*
1458 		 * Note, IRE_INTERFACE can have the stq as NULL. For
1459 		 * example, if the default multicast route is tied to
1460 		 * the loopback address.
1461 		 */
1462 		if ((ire->ire_type & IRE_INTERFACE) &&
1463 		    (ire->ire_stq != NULL)) {
1464 			dst_ill = (ill_t *)ire->ire_stq->q_ptr;
1465 			/*
1466 			 * If there is a usable source address in the
1467 			 * zone, then it's ok to return an
1468 			 * IRE_INTERFACE
1469 			 */
1470 			if ((dst_ill->ill_usesrc_ifindex != 0) &&
1471 			    (src_ipif = ipif_select_source_v6(dst_ill, addr,
1472 			    RESTRICT_TO_NONE, IPV6_PREFER_SRC_DEFAULT, zoneid))
1473 			    != NULL) {
1474 				ip3dbg(("ire_match_args: src_ipif %p"
1475 				    " dst_ill %p", (void *)src_ipif,
1476 				    (void *)dst_ill));
1477 				ipif_refrele(src_ipif);
1478 			} else {
1479 				ip3dbg(("ire_match_args: src_ipif NULL"
1480 				    " dst_ill %p\n", (void *)dst_ill));
1481 				return (B_FALSE);
1482 			}
1483 		}
1484 		if (ire->ire_ipif != NULL && ire->ire_type != IRE_LOCAL &&
1485 		    !(ire->ire_type & IRE_INTERFACE)) {
1486 			ipif_t	*tipif;
1487 
1488 			if ((match_flags & MATCH_IRE_DEFAULT) == 0)
1489 				return (B_FALSE);
1490 			mutex_enter(&ire->ire_ipif->ipif_ill->ill_lock);
1491 			for (tipif = ire->ire_ipif->ipif_ill->ill_ipif;
1492 			    tipif != NULL; tipif = tipif->ipif_next) {
1493 				if (IPIF_CAN_LOOKUP(tipif) &&
1494 				    (tipif->ipif_flags & IPIF_UP) &&
1495 				    (tipif->ipif_zoneid == zoneid ||
1496 				    tipif->ipif_zoneid == ALL_ZONES))
1497 					break;
1498 			}
1499 			mutex_exit(&ire->ire_ipif->ipif_ill->ill_lock);
1500 			if (tipif == NULL)
1501 				return (B_FALSE);
1502 		}
1503 	}
1504 
1505 	if (match_flags & MATCH_IRE_GW) {
1506 		mutex_enter(&ire->ire_lock);
1507 		gw_addr_v6 = ire->ire_gateway_addr_v6;
1508 		mutex_exit(&ire->ire_lock);
1509 	}
1510 	/*
1511 	 * For IRE_CACHES, MATCH_IRE_ILL/ILL_GROUP really means that
1512 	 * somebody wants to send out on a particular interface which
1513 	 * is given by ire_stq and hence use ire_stq to derive the ill
1514 	 * value. ire_ipif for IRE_CACHES is just the
1515 	 * means of getting a source address i.e ire_src_addr_v6 =
1516 	 * ire->ire_ipif->ipif_src_addr_v6.
1517 	 */
1518 	if (match_flags & (MATCH_IRE_ILL|MATCH_IRE_ILL_GROUP)) {
1519 		ire_ill = ire_to_ill(ire);
1520 		if (ire_ill != NULL)
1521 			ire_ill_group = ire_ill->ill_group;
1522 		ipif_ill = ipif->ipif_ill;
1523 		ipif_ill_group = ipif_ill->ill_group;
1524 	}
1525 
1526 	/* No ire_addr_v6 bits set past the mask */
1527 	ASSERT(V6_MASK_EQ(ire->ire_addr_v6, ire->ire_mask_v6,
1528 	    ire->ire_addr_v6));
1529 	V6_MASK_COPY(*addr, *mask, masked_addr);
1530 
1531 	if (V6_MASK_EQ(*addr, *mask, ire->ire_addr_v6) &&
1532 	    ((!(match_flags & MATCH_IRE_GW)) ||
1533 	    IN6_ARE_ADDR_EQUAL(&gw_addr_v6, gateway)) &&
1534 	    ((!(match_flags & MATCH_IRE_TYPE)) ||
1535 	    (ire->ire_type & type)) &&
1536 	    ((!(match_flags & MATCH_IRE_SRC)) ||
1537 	    IN6_ARE_ADDR_EQUAL(&ire->ire_src_addr_v6,
1538 	    &ipif->ipif_v6src_addr)) &&
1539 	    ((!(match_flags & MATCH_IRE_IPIF)) ||
1540 	    (ire->ire_ipif == ipif)) &&
1541 	    ((!(match_flags & MATCH_IRE_MARK_HIDDEN)) ||
1542 	    (ire->ire_type != IRE_CACHE ||
1543 	    ire->ire_marks & IRE_MARK_HIDDEN)) &&
1544 	    ((!(match_flags & MATCH_IRE_ILL)) ||
1545 	    (ire_ill == ipif_ill)) &&
1546 	    ((!(match_flags & MATCH_IRE_IHANDLE)) ||
1547 	    (ire->ire_ihandle == ihandle)) &&
1548 	    ((!(match_flags & MATCH_IRE_ILL_GROUP)) ||
1549 	    (ire_ill == ipif_ill) ||
1550 	    (ire_ill_group != NULL &&
1551 	    ire_ill_group == ipif_ill_group)) &&
1552 	    ((!(match_flags & MATCH_IRE_SECATTR)) ||
1553 	    (!is_system_labeled()) ||
1554 	    (tsol_ire_match_gwattr(ire, tsl) == 0))) {
1555 		/* We found the matched IRE */
1556 		return (B_TRUE);
1557 	}
1558 	return (B_FALSE);
1559 }
1560 
1561 /*
1562  * Lookup for a route in all the tables
1563  */
1564 ire_t *
1565 ire_route_lookup_v6(const in6_addr_t *addr, const in6_addr_t *mask,
1566     const in6_addr_t *gateway, int type, const ipif_t *ipif, ire_t **pire,
1567     zoneid_t zoneid, const ts_label_t *tsl, int flags, ip_stack_t *ipst)
1568 {
1569 	ire_t *ire = NULL;
1570 
1571 	/*
1572 	 * ire_match_args_v6() will dereference ipif MATCH_IRE_SRC or
1573 	 * MATCH_IRE_ILL is set.
1574 	 */
1575 	if ((flags & (MATCH_IRE_SRC | MATCH_IRE_ILL | MATCH_IRE_ILL_GROUP)) &&
1576 	    (ipif == NULL))
1577 		return (NULL);
1578 
1579 	/*
1580 	 * might be asking for a cache lookup,
1581 	 * This is not best way to lookup cache,
1582 	 * user should call ire_cache_lookup directly.
1583 	 *
1584 	 * If MATCH_IRE_TYPE was set, first lookup in the cache table and then
1585 	 * in the forwarding table, if the applicable type flags were set.
1586 	 */
1587 	if ((flags & MATCH_IRE_TYPE) == 0 || (type & IRE_CACHETABLE) != 0) {
1588 		ire = ire_ctable_lookup_v6(addr, gateway, type, ipif, zoneid,
1589 		    tsl, flags, ipst);
1590 		if (ire != NULL)
1591 			return (ire);
1592 	}
1593 	if ((flags & MATCH_IRE_TYPE) == 0 || (type & IRE_FORWARDTABLE) != 0) {
1594 		ire = ire_ftable_lookup_v6(addr, mask, gateway, type, ipif,
1595 		    pire, zoneid, 0, tsl, flags, ipst);
1596 	}
1597 	return (ire);
1598 }
1599 
1600 /*
1601  * Lookup a route in forwarding table.
1602  * specific lookup is indicated by passing the
1603  * required parameters and indicating the
1604  * match required in flag field.
1605  *
1606  * Looking for default route can be done in three ways
1607  * 1) pass mask as ipv6_all_zeros and set MATCH_IRE_MASK in flags field
1608  *    along with other matches.
1609  * 2) pass type as IRE_DEFAULT and set MATCH_IRE_TYPE in flags
1610  *    field along with other matches.
1611  * 3) if the destination and mask are passed as zeros.
1612  *
1613  * A request to return a default route if no route
1614  * is found, can be specified by setting MATCH_IRE_DEFAULT
1615  * in flags.
1616  *
1617  * It does not support recursion more than one level. It
1618  * will do recursive lookup only when the lookup maps to
1619  * a prefix or default route and MATCH_IRE_RECURSIVE flag is passed.
1620  *
1621  * If the routing table is setup to allow more than one level
1622  * of recursion, the cleaning up cache table will not work resulting
1623  * in invalid routing.
1624  *
1625  * Supports link-local addresses by following the ipif/ill when recursing.
1626  *
1627  * NOTE : When this function returns NULL, pire has already been released.
1628  *	  pire is valid only when this function successfully returns an
1629  *	  ire.
1630  */
1631 ire_t *
1632 ire_ftable_lookup_v6(const in6_addr_t *addr, const in6_addr_t *mask,
1633     const in6_addr_t *gateway, int type, const ipif_t *ipif, ire_t **pire,
1634     zoneid_t zoneid, uint32_t ihandle, const ts_label_t *tsl, int flags,
1635     ip_stack_t *ipst)
1636 {
1637 	irb_t *irb_ptr;
1638 	ire_t	*rire;
1639 	ire_t *ire = NULL;
1640 	ire_t	*saved_ire;
1641 	nce_t	*nce;
1642 	int i;
1643 	in6_addr_t gw_addr_v6;
1644 
1645 	ASSERT(addr != NULL);
1646 	ASSERT((!(flags & MATCH_IRE_MASK)) || mask != NULL);
1647 	ASSERT((!(flags & MATCH_IRE_GW)) || gateway != NULL);
1648 	ASSERT(ipif == NULL || ipif->ipif_isv6);
1649 	ASSERT(!(flags & MATCH_IRE_WQ));
1650 
1651 	/*
1652 	 * When we return NULL from this function, we should make
1653 	 * sure that *pire is NULL so that the callers will not
1654 	 * wrongly REFRELE the pire.
1655 	 */
1656 	if (pire != NULL)
1657 		*pire = NULL;
1658 	/*
1659 	 * ire_match_args_v6() will dereference ipif if MATCH_IRE_SRC,
1660 	 * MATCH_IRE_ILL or MATCH_IRE_ILL_GROUP is set.
1661 	 */
1662 	if ((flags & (MATCH_IRE_SRC | MATCH_IRE_ILL | MATCH_IRE_ILL_GROUP)) &&
1663 	    (ipif == NULL))
1664 		return (NULL);
1665 
1666 	/*
1667 	 * If the mask is known, the lookup
1668 	 * is simple, if the mask is not known
1669 	 * we need to search.
1670 	 */
1671 	if (flags & MATCH_IRE_MASK) {
1672 		uint_t masklen;
1673 
1674 		masklen = ip_mask_to_plen_v6(mask);
1675 		if (ipst->ips_ip_forwarding_table_v6[masklen] == NULL)
1676 			return (NULL);
1677 		irb_ptr = &(ipst->ips_ip_forwarding_table_v6[masklen][
1678 		    IRE_ADDR_MASK_HASH_V6(*addr, *mask,
1679 		    ipst->ips_ip6_ftable_hash_size)]);
1680 		rw_enter(&irb_ptr->irb_lock, RW_READER);
1681 		for (ire = irb_ptr->irb_ire; ire != NULL;
1682 		    ire = ire->ire_next) {
1683 			if (ire->ire_marks & IRE_MARK_CONDEMNED)
1684 				continue;
1685 			if (ire_match_args_v6(ire, addr, mask, gateway, type,
1686 			    ipif, zoneid, ihandle, tsl, flags))
1687 				goto found_ire;
1688 		}
1689 		rw_exit(&irb_ptr->irb_lock);
1690 	} else {
1691 		/*
1692 		 * In this case we don't know the mask, we need to
1693 		 * search the table assuming different mask sizes.
1694 		 * we start with 128 bit mask, we don't allow default here.
1695 		 */
1696 		for (i = (IP6_MASK_TABLE_SIZE - 1); i > 0; i--) {
1697 			in6_addr_t tmpmask;
1698 
1699 			if ((ipst->ips_ip_forwarding_table_v6[i]) == NULL)
1700 				continue;
1701 			(void) ip_plen_to_mask_v6(i, &tmpmask);
1702 			irb_ptr = &ipst->ips_ip_forwarding_table_v6[i][
1703 			    IRE_ADDR_MASK_HASH_V6(*addr, tmpmask,
1704 			    ipst->ips_ip6_ftable_hash_size)];
1705 			rw_enter(&irb_ptr->irb_lock, RW_READER);
1706 			for (ire = irb_ptr->irb_ire; ire != NULL;
1707 			    ire = ire->ire_next) {
1708 				if (ire->ire_marks & IRE_MARK_CONDEMNED)
1709 					continue;
1710 				if (ire_match_args_v6(ire, addr,
1711 				    &ire->ire_mask_v6, gateway, type, ipif,
1712 				    zoneid, ihandle, tsl, flags))
1713 					goto found_ire;
1714 			}
1715 			rw_exit(&irb_ptr->irb_lock);
1716 		}
1717 	}
1718 
1719 	/*
1720 	 * We come here if no route has yet been found.
1721 	 *
1722 	 * Handle the case where default route is
1723 	 * requested by specifying type as one of the possible
1724 	 * types for that can have a zero mask (IRE_DEFAULT and IRE_INTERFACE).
1725 	 *
1726 	 * If MATCH_IRE_MASK is specified, then the appropriate default route
1727 	 * would have been found above if it exists so it isn't looked up here.
1728 	 * If MATCH_IRE_DEFAULT was also specified, then a default route will be
1729 	 * searched for later.
1730 	 */
1731 	if ((flags & (MATCH_IRE_TYPE | MATCH_IRE_MASK)) == MATCH_IRE_TYPE &&
1732 	    (type & (IRE_DEFAULT | IRE_INTERFACE))) {
1733 		if (ipst->ips_ip_forwarding_table_v6[0] != NULL) {
1734 			/* addr & mask is zero for defaults */
1735 			irb_ptr = &ipst->ips_ip_forwarding_table_v6[0][
1736 			    IRE_ADDR_HASH_V6(ipv6_all_zeros,
1737 			    ipst->ips_ip6_ftable_hash_size)];
1738 			rw_enter(&irb_ptr->irb_lock, RW_READER);
1739 			for (ire = irb_ptr->irb_ire; ire != NULL;
1740 			    ire = ire->ire_next) {
1741 
1742 				if (ire->ire_marks & IRE_MARK_CONDEMNED)
1743 					continue;
1744 
1745 				if (ire_match_args_v6(ire, addr,
1746 				    &ipv6_all_zeros, gateway, type, ipif,
1747 				    zoneid, ihandle, tsl, flags))
1748 					goto found_ire;
1749 			}
1750 			rw_exit(&irb_ptr->irb_lock);
1751 		}
1752 	}
1753 	/*
1754 	 * We come here only if no route is found.
1755 	 * see if the default route can be used which is allowed
1756 	 * only if the default matching criteria is specified.
1757 	 * The ipv6_ire_default_count tracks the number of IRE_DEFAULT
1758 	 * entries. However, the ip_forwarding_table_v6[0] also contains
1759 	 * interface routes thus the count can be zero.
1760 	 */
1761 	saved_ire = NULL;
1762 	if ((flags & (MATCH_IRE_DEFAULT | MATCH_IRE_MASK)) ==
1763 	    MATCH_IRE_DEFAULT) {
1764 		ire_t	*ire_origin;
1765 		uint_t	g_index;
1766 		uint_t	index;
1767 
1768 		if (ipst->ips_ip_forwarding_table_v6[0] == NULL)
1769 			return (NULL);
1770 		irb_ptr = &(ipst->ips_ip_forwarding_table_v6[0])[0];
1771 
1772 		/*
1773 		 * Keep a tab on the bucket while looking the IRE_DEFAULT
1774 		 * entries. We need to keep track of a particular IRE
1775 		 * (ire_origin) so this ensures that it will not be unlinked
1776 		 * from the hash list during the recursive lookup below.
1777 		 */
1778 		IRB_REFHOLD(irb_ptr);
1779 		ire = irb_ptr->irb_ire;
1780 		if (ire == NULL) {
1781 			IRB_REFRELE(irb_ptr);
1782 			return (NULL);
1783 		}
1784 
1785 		/*
1786 		 * Get the index first, since it can be changed by other
1787 		 * threads. Then get to the right default route skipping
1788 		 * default interface routes if any. As we hold a reference on
1789 		 * the IRE bucket, ipv6_ire_default_count can only increase so
1790 		 * we can't reach the end of the hash list unexpectedly.
1791 		 */
1792 		if (ipst->ips_ipv6_ire_default_count != 0) {
1793 			g_index = ipst->ips_ipv6_ire_default_index++;
1794 			index = g_index % ipst->ips_ipv6_ire_default_count;
1795 			while (index != 0) {
1796 				if (!(ire->ire_type & IRE_INTERFACE))
1797 					index--;
1798 				ire = ire->ire_next;
1799 			}
1800 			ASSERT(ire != NULL);
1801 		} else {
1802 			/*
1803 			 * No default route, so we only have default interface
1804 			 * routes: don't enter the first loop.
1805 			 */
1806 			ire = NULL;
1807 		}
1808 
1809 		/*
1810 		 * Round-robin the default routers list looking for a neighbor
1811 		 * that matches the passed in parameters and is reachable.  If
1812 		 * none found, just return a route from the default router list
1813 		 * if it exists. If we can't find a default route (IRE_DEFAULT),
1814 		 * look for interface default routes.
1815 		 * We start with the ire we found above and we walk the hash
1816 		 * list until we're back where we started, see
1817 		 * ire_get_next_default_ire(). It doesn't matter if default
1818 		 * routes are added or deleted by other threads - we know this
1819 		 * ire will stay in the list because we hold a reference on the
1820 		 * ire bucket.
1821 		 * NB: if we only have interface default routes, ire is NULL so
1822 		 * we don't even enter this loop (see above).
1823 		 */
1824 		ire_origin = ire;
1825 		for (; ire != NULL;
1826 		    ire = ire_get_next_default_ire(ire, ire_origin)) {
1827 
1828 			if (ire_match_args_v6(ire, addr,
1829 			    &ipv6_all_zeros, gateway, type, ipif,
1830 			    zoneid, ihandle, tsl, flags)) {
1831 				int match_flags;
1832 
1833 				/*
1834 				 * We have something to work with.
1835 				 * If we can find a resolved/reachable
1836 				 * entry, we will use this. Otherwise
1837 				 * we'll try to find an entry that has
1838 				 * a resolved cache entry. We will fallback
1839 				 * on this if we don't find anything else.
1840 				 */
1841 				if (saved_ire == NULL)
1842 					saved_ire = ire;
1843 				mutex_enter(&ire->ire_lock);
1844 				gw_addr_v6 = ire->ire_gateway_addr_v6;
1845 				mutex_exit(&ire->ire_lock);
1846 				match_flags = MATCH_IRE_ILL_GROUP |
1847 				    MATCH_IRE_SECATTR;
1848 				rire = ire_ctable_lookup_v6(&gw_addr_v6, NULL,
1849 				    0, ire->ire_ipif, zoneid, tsl, match_flags,
1850 				    ipst);
1851 				if (rire != NULL) {
1852 					nce = rire->ire_nce;
1853 					if (nce != NULL &&
1854 					    NCE_ISREACHABLE(nce) &&
1855 					    nce->nce_flags & NCE_F_ISROUTER) {
1856 						ire_refrele(rire);
1857 						IRE_REFHOLD(ire);
1858 						IRB_REFRELE(irb_ptr);
1859 						goto found_ire_held;
1860 					} else if (nce != NULL &&
1861 					    !(nce->nce_flags &
1862 					    NCE_F_ISROUTER)) {
1863 						/*
1864 						 * Make sure we don't use
1865 						 * this ire
1866 						 */
1867 						if (saved_ire == ire)
1868 							saved_ire = NULL;
1869 					}
1870 					ire_refrele(rire);
1871 				} else if (ipst->
1872 				    ips_ipv6_ire_default_count > 1 &&
1873 				    zoneid != GLOBAL_ZONEID) {
1874 					/*
1875 					 * When we're in a local zone, we're
1876 					 * only interested in default routers
1877 					 * that are reachable through ipifs
1878 					 * within our zone.
1879 					 * The potentially expensive call to
1880 					 * ire_route_lookup_v6() is avoided when
1881 					 * we have only one default route.
1882 					 */
1883 					int ire_match_flags = MATCH_IRE_TYPE |
1884 					    MATCH_IRE_SECATTR;
1885 
1886 					if (ire->ire_ipif != NULL) {
1887 						ire_match_flags |=
1888 						    MATCH_IRE_ILL_GROUP;
1889 					}
1890 					rire = ire_route_lookup_v6(&gw_addr_v6,
1891 					    NULL, NULL, IRE_INTERFACE,
1892 					    ire->ire_ipif, NULL,
1893 					    zoneid, tsl, ire_match_flags, ipst);
1894 					if (rire != NULL) {
1895 						ire_refrele(rire);
1896 						saved_ire = ire;
1897 					} else if (saved_ire == ire) {
1898 						/*
1899 						 * Make sure we don't use
1900 						 * this ire
1901 						 */
1902 						saved_ire = NULL;
1903 					}
1904 				}
1905 			}
1906 		}
1907 		if (saved_ire != NULL) {
1908 			ire = saved_ire;
1909 			IRE_REFHOLD(ire);
1910 			IRB_REFRELE(irb_ptr);
1911 			goto found_ire_held;
1912 		} else {
1913 			/*
1914 			 * Look for a interface default route matching the
1915 			 * args passed in. No round robin here. Just pick
1916 			 * the right one.
1917 			 */
1918 			for (ire = irb_ptr->irb_ire; ire != NULL;
1919 			    ire = ire->ire_next) {
1920 
1921 				if (!(ire->ire_type & IRE_INTERFACE))
1922 					continue;
1923 
1924 				if (ire->ire_marks & IRE_MARK_CONDEMNED)
1925 					continue;
1926 
1927 				if (ire_match_args_v6(ire, addr,
1928 				    &ipv6_all_zeros, gateway, type, ipif,
1929 				    zoneid, ihandle, tsl, flags)) {
1930 					IRE_REFHOLD(ire);
1931 					IRB_REFRELE(irb_ptr);
1932 					goto found_ire_held;
1933 				}
1934 			}
1935 			IRB_REFRELE(irb_ptr);
1936 		}
1937 	}
1938 	ASSERT(ire == NULL);
1939 	ip1dbg(("ire_ftable_lookup_v6: returning NULL ire"));
1940 	return (NULL);
1941 found_ire:
1942 	ASSERT((ire->ire_marks & IRE_MARK_CONDEMNED) == 0);
1943 	IRE_REFHOLD(ire);
1944 	rw_exit(&irb_ptr->irb_lock);
1945 
1946 found_ire_held:
1947 	if ((flags & MATCH_IRE_RJ_BHOLE) &&
1948 	    (ire->ire_flags & (RTF_BLACKHOLE | RTF_REJECT))) {
1949 		return (ire);
1950 	}
1951 	/*
1952 	 * At this point, IRE that was found must be an IRE_FORWARDTABLE
1953 	 * or IRE_CACHETABLE type.  If this is a recursive lookup and an
1954 	 * IRE_INTERFACE type was found, return that.  If it was some other
1955 	 * IRE_FORWARDTABLE type of IRE (one of the prefix types), then it
1956 	 * is necessary to fill in the  parent IRE pointed to by pire, and
1957 	 * then lookup the gateway address of  the parent.  For backwards
1958 	 * compatiblity, if this lookup returns an
1959 	 * IRE other than a IRE_CACHETABLE or IRE_INTERFACE, then one more level
1960 	 * of lookup is done.
1961 	 */
1962 	if (flags & MATCH_IRE_RECURSIVE) {
1963 		const ipif_t *gw_ipif;
1964 		int match_flags = MATCH_IRE_DSTONLY;
1965 
1966 		if (ire->ire_type & IRE_INTERFACE)
1967 			return (ire);
1968 		if (pire != NULL)
1969 			*pire = ire;
1970 		/*
1971 		 * If we can't find an IRE_INTERFACE or the caller has not
1972 		 * asked for pire, we need to REFRELE the saved_ire.
1973 		 */
1974 		saved_ire = ire;
1975 
1976 		/*
1977 		 * Currently MATCH_IRE_ILL is never used with
1978 		 * (MATCH_IRE_RECURSIVE | MATCH_IRE_DEFAULT) while
1979 		 * sending out packets as MATCH_IRE_ILL is used only
1980 		 * for communicating with on-link hosts. We can't assert
1981 		 * that here as RTM_GET calls this function with
1982 		 * MATCH_IRE_ILL | MATCH_IRE_DEFAULT | MATCH_IRE_RECURSIVE.
1983 		 * We have already used the MATCH_IRE_ILL in determining
1984 		 * the right prefix route at this point. To match the
1985 		 * behavior of how we locate routes while sending out
1986 		 * packets, we don't want to use MATCH_IRE_ILL below
1987 		 * while locating the interface route.
1988 		 */
1989 		if (ire->ire_ipif != NULL)
1990 			match_flags |= MATCH_IRE_ILL_GROUP;
1991 
1992 		mutex_enter(&ire->ire_lock);
1993 		gw_addr_v6 = ire->ire_gateway_addr_v6;
1994 		mutex_exit(&ire->ire_lock);
1995 
1996 		ire = ire_route_lookup_v6(&gw_addr_v6, NULL, NULL, 0,
1997 		    ire->ire_ipif, NULL, zoneid, tsl, match_flags, ipst);
1998 		if (ire == NULL) {
1999 			/*
2000 			 * In this case we have to deal with the
2001 			 * MATCH_IRE_PARENT flag, which means the
2002 			 * parent has to be returned if ire is NULL.
2003 			 * The aim of this is to have (at least) a starting
2004 			 * ire when we want to look at all of the ires in a
2005 			 * bucket aimed at a single destination (as is the
2006 			 * case in ip_newroute_v6 for the RTF_MULTIRT
2007 			 * flagged routes).
2008 			 */
2009 			if (flags & MATCH_IRE_PARENT) {
2010 				if (pire != NULL) {
2011 					/*
2012 					 * Need an extra REFHOLD, if the
2013 					 * parent ire is returned via both
2014 					 * ire and pire.
2015 					 */
2016 					IRE_REFHOLD(saved_ire);
2017 				}
2018 				ire = saved_ire;
2019 			} else {
2020 				ire_refrele(saved_ire);
2021 				if (pire != NULL)
2022 					*pire = NULL;
2023 			}
2024 			return (ire);
2025 		}
2026 		if (ire->ire_type & (IRE_CACHETABLE | IRE_INTERFACE)) {
2027 			/*
2028 			 * If the caller did not ask for pire, release
2029 			 * it now.
2030 			 */
2031 			if (pire == NULL) {
2032 				ire_refrele(saved_ire);
2033 			}
2034 			return (ire);
2035 		}
2036 		match_flags |= MATCH_IRE_TYPE;
2037 		mutex_enter(&ire->ire_lock);
2038 		gw_addr_v6 = ire->ire_gateway_addr_v6;
2039 		mutex_exit(&ire->ire_lock);
2040 		gw_ipif = ire->ire_ipif;
2041 		ire_refrele(ire);
2042 		ire = ire_route_lookup_v6(&gw_addr_v6, NULL, NULL,
2043 		    (IRE_CACHETABLE | IRE_INTERFACE), gw_ipif, NULL, zoneid,
2044 		    NULL, match_flags, ipst);
2045 		if (ire == NULL) {
2046 			/*
2047 			 * In this case we have to deal with the
2048 			 * MATCH_IRE_PARENT flag, which means the
2049 			 * parent has to be returned if ire is NULL.
2050 			 * The aim of this is to have (at least) a starting
2051 			 * ire when we want to look at all of the ires in a
2052 			 * bucket aimed at a single destination (as is the
2053 			 * case in ip_newroute_v6 for the RTF_MULTIRT
2054 			 * flagged routes).
2055 			 */
2056 			if (flags & MATCH_IRE_PARENT) {
2057 				if (pire != NULL) {
2058 					/*
2059 					 * Need an extra REFHOLD, if the
2060 					 * parent ire is returned via both
2061 					 * ire and pire.
2062 					 */
2063 					IRE_REFHOLD(saved_ire);
2064 				}
2065 				ire = saved_ire;
2066 			} else {
2067 				ire_refrele(saved_ire);
2068 				if (pire != NULL)
2069 					*pire = NULL;
2070 			}
2071 			return (ire);
2072 		} else if (pire == NULL) {
2073 			/*
2074 			 * If the caller did not ask for pire, release
2075 			 * it now.
2076 			 */
2077 			ire_refrele(saved_ire);
2078 		}
2079 		return (ire);
2080 	}
2081 
2082 	ASSERT(pire == NULL || *pire == NULL);
2083 	return (ire);
2084 }
2085 
2086 /*
2087  * Delete the IRE cache for the gateway and all IRE caches whose
2088  * ire_gateway_addr_v6 points to this gateway, and allow them to
2089  * be created on demand by ip_newroute_v6.
2090  */
2091 void
2092 ire_clookup_delete_cache_gw_v6(const in6_addr_t *addr, zoneid_t zoneid,
2093 	ip_stack_t *ipst)
2094 {
2095 	irb_t *irb;
2096 	ire_t *ire;
2097 
2098 	irb = &ipst->ips_ip_cache_table_v6[IRE_ADDR_HASH_V6(*addr,
2099 	    ipst->ips_ip6_cache_table_size)];
2100 	IRB_REFHOLD(irb);
2101 	for (ire = irb->irb_ire; ire != NULL; ire = ire->ire_next) {
2102 		if (ire->ire_marks & IRE_MARK_CONDEMNED)
2103 			continue;
2104 
2105 		ASSERT(IN6_ARE_ADDR_EQUAL(&ire->ire_mask_v6, &ipv6_all_ones));
2106 		if (ire_match_args_v6(ire, addr, &ire->ire_mask_v6, 0,
2107 		    IRE_CACHE, NULL, zoneid, 0, NULL, MATCH_IRE_TYPE)) {
2108 			ire_delete(ire);
2109 		}
2110 	}
2111 	IRB_REFRELE(irb);
2112 
2113 	ire_walk_v6(ire_delete_cache_gw_v6, (char *)addr, zoneid, ipst);
2114 }
2115 
2116 /*
2117  * Looks up cache table for a route.
2118  * specific lookup can be indicated by
2119  * passing the MATCH_* flags and the
2120  * necessary parameters.
2121  */
2122 ire_t *
2123 ire_ctable_lookup_v6(const in6_addr_t *addr, const in6_addr_t *gateway,
2124     int type, const ipif_t *ipif, zoneid_t zoneid, const ts_label_t *tsl,
2125     int flags, ip_stack_t *ipst)
2126 {
2127 	ire_t *ire;
2128 	irb_t *irb_ptr;
2129 	ASSERT(addr != NULL);
2130 	ASSERT((!(flags & MATCH_IRE_GW)) || gateway != NULL);
2131 
2132 	/*
2133 	 * ire_match_args_v6() will dereference ipif MATCH_IRE_SRC or
2134 	 * MATCH_IRE_ILL is set.
2135 	 */
2136 	if ((flags & (MATCH_IRE_SRC |  MATCH_IRE_ILL | MATCH_IRE_ILL_GROUP)) &&
2137 	    (ipif == NULL))
2138 		return (NULL);
2139 
2140 	irb_ptr = &ipst->ips_ip_cache_table_v6[IRE_ADDR_HASH_V6(*addr,
2141 	    ipst->ips_ip6_cache_table_size)];
2142 	rw_enter(&irb_ptr->irb_lock, RW_READER);
2143 	for (ire = irb_ptr->irb_ire; ire; ire = ire->ire_next) {
2144 		if (ire->ire_marks & IRE_MARK_CONDEMNED)
2145 			continue;
2146 
2147 		ASSERT(IN6_ARE_ADDR_EQUAL(&ire->ire_mask_v6, &ipv6_all_ones));
2148 		if (ire_match_args_v6(ire, addr, &ire->ire_mask_v6, gateway,
2149 		    type, ipif, zoneid, 0, tsl, flags)) {
2150 			IRE_REFHOLD(ire);
2151 			rw_exit(&irb_ptr->irb_lock);
2152 			return (ire);
2153 		}
2154 	}
2155 	rw_exit(&irb_ptr->irb_lock);
2156 	return (NULL);
2157 }
2158 
2159 /*
2160  * Lookup cache. Don't return IRE_MARK_HIDDEN entries. Callers
2161  * should use ire_ctable_lookup with MATCH_IRE_MARK_HIDDEN to get
2162  * to the hidden ones.
2163  *
2164  * In general the zoneid has to match (where ALL_ZONES match all of them).
2165  * But for IRE_LOCAL we also need to handle the case where L2 should
2166  * conceptually loop back the packet. This is necessary since neither
2167  * Ethernet drivers nor Ethernet hardware loops back packets sent to their
2168  * own MAC address. This loopback is needed when the normal
2169  * routes (ignoring IREs with different zoneids) would send out the packet on
2170  * the same ill (or ill group) as the ill with which this IRE_LOCAL is
2171  * associated.
2172  *
2173  * Earlier versions of this code always matched an IRE_LOCAL independently of
2174  * the zoneid. We preserve that earlier behavior when
2175  * ip_restrict_interzone_loopback is turned off.
2176  */
2177 ire_t *
2178 ire_cache_lookup_v6(const in6_addr_t *addr, zoneid_t zoneid,
2179     const ts_label_t *tsl, ip_stack_t *ipst)
2180 {
2181 	irb_t *irb_ptr;
2182 	ire_t *ire;
2183 
2184 	irb_ptr = &ipst->ips_ip_cache_table_v6[IRE_ADDR_HASH_V6(*addr,
2185 	    ipst->ips_ip6_cache_table_size)];
2186 	rw_enter(&irb_ptr->irb_lock, RW_READER);
2187 	for (ire = irb_ptr->irb_ire; ire; ire = ire->ire_next) {
2188 		if (ire->ire_marks & (IRE_MARK_CONDEMNED|IRE_MARK_HIDDEN))
2189 			continue;
2190 		if (IN6_ARE_ADDR_EQUAL(&ire->ire_addr_v6, addr)) {
2191 			/*
2192 			 * Finally, check if the security policy has any
2193 			 * restriction on using this route for the specified
2194 			 * message.
2195 			 */
2196 			if (tsl != NULL &&
2197 			    ire->ire_gw_secattr != NULL &&
2198 			    tsol_ire_match_gwattr(ire, tsl) != 0) {
2199 				continue;
2200 			}
2201 
2202 			if (zoneid == ALL_ZONES || ire->ire_zoneid == zoneid ||
2203 			    ire->ire_zoneid == ALL_ZONES) {
2204 				IRE_REFHOLD(ire);
2205 				rw_exit(&irb_ptr->irb_lock);
2206 				return (ire);
2207 			}
2208 
2209 			if (ire->ire_type == IRE_LOCAL) {
2210 				if (ipst->ips_ip_restrict_interzone_loopback &&
2211 				    !ire_local_ok_across_zones(ire, zoneid,
2212 				    (void *)addr, tsl, ipst))
2213 					continue;
2214 
2215 				IRE_REFHOLD(ire);
2216 				rw_exit(&irb_ptr->irb_lock);
2217 				return (ire);
2218 			}
2219 		}
2220 	}
2221 	rw_exit(&irb_ptr->irb_lock);
2222 	return (NULL);
2223 }
2224 
2225 /*
2226  * Locate the interface ire that is tied to the cache ire 'cire' via
2227  * cire->ire_ihandle.
2228  *
2229  * We are trying to create the cache ire for an onlink destn. or
2230  * gateway in 'cire'. We are called from ire_add_v6() in the IRE_IF_RESOLVER
2231  * case for xresolv interfaces, after the ire has come back from
2232  * an external resolver.
2233  */
2234 static ire_t *
2235 ire_ihandle_lookup_onlink_v6(ire_t *cire)
2236 {
2237 	ire_t	*ire;
2238 	int	match_flags;
2239 	int	i;
2240 	int	j;
2241 	irb_t	*irb_ptr;
2242 	ip_stack_t	*ipst = cire->ire_ipst;
2243 
2244 	ASSERT(cire != NULL);
2245 
2246 	match_flags =  MATCH_IRE_TYPE | MATCH_IRE_IHANDLE | MATCH_IRE_MASK;
2247 	/*
2248 	 * We know that the mask of the interface ire equals cire->ire_cmask.
2249 	 * (When ip_newroute_v6() created 'cire' for an on-link destn.
2250 	 * it set its cmask from the interface ire's mask)
2251 	 */
2252 	ire = ire_ftable_lookup_v6(&cire->ire_addr_v6, &cire->ire_cmask_v6,
2253 	    NULL, IRE_INTERFACE, NULL, NULL, ALL_ZONES, cire->ire_ihandle,
2254 	    NULL, match_flags, ipst);
2255 	if (ire != NULL)
2256 		return (ire);
2257 	/*
2258 	 * If we didn't find an interface ire above, we can't declare failure.
2259 	 * For backwards compatibility, we need to support prefix routes
2260 	 * pointing to next hop gateways that are not on-link.
2261 	 *
2262 	 * In the resolver/noresolver case, ip_newroute_v6() thinks
2263 	 * it is creating the cache ire for an onlink destination in 'cire'.
2264 	 * But 'cire' is not actually onlink, because ire_ftable_lookup_v6()
2265 	 * cheated it, by doing ire_route_lookup_v6() twice and returning an
2266 	 * interface ire.
2267 	 *
2268 	 * Eg. default	-	gw1			(line 1)
2269 	 *	gw1	-	gw2			(line 2)
2270 	 *	gw2	-	hme0			(line 3)
2271 	 *
2272 	 * In the above example, ip_newroute_v6() tried to create the cache ire
2273 	 * 'cire' for gw1, based on the interface route in line 3. The
2274 	 * ire_ftable_lookup_v6() above fails, because there is
2275 	 * no interface route to reach gw1. (it is gw2). We fall thru below.
2276 	 *
2277 	 * Do a brute force search based on the ihandle in a subset of the
2278 	 * forwarding tables, corresponding to cire->ire_cmask_v6. Otherwise
2279 	 * things become very complex, since we don't have 'pire' in this
2280 	 * case. (Also note that this method is not possible in the offlink
2281 	 * case because we don't know the mask)
2282 	 */
2283 	i = ip_mask_to_plen_v6(&cire->ire_cmask_v6);
2284 	if ((ipst->ips_ip_forwarding_table_v6[i]) == NULL)
2285 		return (NULL);
2286 	for (j = 0; j < ipst->ips_ip6_ftable_hash_size; j++) {
2287 		irb_ptr = &ipst->ips_ip_forwarding_table_v6[i][j];
2288 		rw_enter(&irb_ptr->irb_lock, RW_READER);
2289 		for (ire = irb_ptr->irb_ire; ire != NULL;
2290 		    ire = ire->ire_next) {
2291 			if (ire->ire_marks & IRE_MARK_CONDEMNED)
2292 				continue;
2293 			if ((ire->ire_type & IRE_INTERFACE) &&
2294 			    (ire->ire_ihandle == cire->ire_ihandle)) {
2295 				IRE_REFHOLD(ire);
2296 				rw_exit(&irb_ptr->irb_lock);
2297 				return (ire);
2298 			}
2299 		}
2300 		rw_exit(&irb_ptr->irb_lock);
2301 	}
2302 	return (NULL);
2303 }
2304 
2305 
2306 /*
2307  * Locate the interface ire that is tied to the cache ire 'cire' via
2308  * cire->ire_ihandle.
2309  *
2310  * We are trying to create the cache ire for an offlink destn based
2311  * on the cache ire of the gateway in 'cire'. 'pire' is the prefix ire
2312  * as found by ip_newroute_v6(). We are called from ip_newroute_v6() in
2313  * the IRE_CACHE case.
2314  */
2315 ire_t *
2316 ire_ihandle_lookup_offlink_v6(ire_t *cire, ire_t *pire)
2317 {
2318 	ire_t	*ire;
2319 	int	match_flags;
2320 	in6_addr_t	gw_addr;
2321 	ipif_t		*gw_ipif;
2322 	ip_stack_t	*ipst = cire->ire_ipst;
2323 
2324 	ASSERT(cire != NULL && pire != NULL);
2325 
2326 	match_flags =  MATCH_IRE_TYPE | MATCH_IRE_IHANDLE | MATCH_IRE_MASK;
2327 	/*
2328 	 * ip_newroute_v6 calls ire_ftable_lookup with MATCH_IRE_ILL only
2329 	 * for on-link hosts. We should never be here for onlink.
2330 	 * Thus, use MATCH_IRE_ILL_GROUP.
2331 	 */
2332 	if (pire->ire_ipif != NULL)
2333 		match_flags |= MATCH_IRE_ILL_GROUP;
2334 	/*
2335 	 * We know that the mask of the interface ire equals cire->ire_cmask.
2336 	 * (When ip_newroute_v6() created 'cire' for an on-link destn. it set
2337 	 * its cmask from the interface ire's mask)
2338 	 */
2339 	ire = ire_ftable_lookup_v6(&cire->ire_addr_v6, &cire->ire_cmask_v6, 0,
2340 	    IRE_INTERFACE, pire->ire_ipif, NULL, ALL_ZONES, cire->ire_ihandle,
2341 	    NULL, match_flags, ipst);
2342 	if (ire != NULL)
2343 		return (ire);
2344 	/*
2345 	 * If we didn't find an interface ire above, we can't declare failure.
2346 	 * For backwards compatibility, we need to support prefix routes
2347 	 * pointing to next hop gateways that are not on-link.
2348 	 *
2349 	 * Assume we are trying to ping some offlink destn, and we have the
2350 	 * routing table below.
2351 	 *
2352 	 * Eg.	default	- gw1		<--- pire	(line 1)
2353 	 *	gw1	- gw2				(line 2)
2354 	 *	gw2	- hme0				(line 3)
2355 	 *
2356 	 * If we already have a cache ire for gw1 in 'cire', the
2357 	 * ire_ftable_lookup_v6 above would have failed, since there is no
2358 	 * interface ire to reach gw1. We will fallthru below.
2359 	 *
2360 	 * Here we duplicate the steps that ire_ftable_lookup_v6() did in
2361 	 * getting 'cire' from 'pire', in the MATCH_IRE_RECURSIVE case.
2362 	 * The differences are the following
2363 	 * i.   We want the interface ire only, so we call
2364 	 *	ire_ftable_lookup_v6() instead of ire_route_lookup_v6()
2365 	 * ii.  We look for only prefix routes in the 1st call below.
2366 	 * ii.  We want to match on the ihandle in the 2nd call below.
2367 	 */
2368 	match_flags =  MATCH_IRE_TYPE;
2369 	if (pire->ire_ipif != NULL)
2370 		match_flags |= MATCH_IRE_ILL_GROUP;
2371 
2372 	mutex_enter(&pire->ire_lock);
2373 	gw_addr = pire->ire_gateway_addr_v6;
2374 	mutex_exit(&pire->ire_lock);
2375 	ire = ire_ftable_lookup_v6(&gw_addr, 0, 0, IRE_OFFSUBNET,
2376 	    pire->ire_ipif, NULL, ALL_ZONES, 0, NULL, match_flags, ipst);
2377 	if (ire == NULL)
2378 		return (NULL);
2379 	/*
2380 	 * At this point 'ire' corresponds to the entry shown in line 2.
2381 	 * gw_addr is 'gw2' in the example above.
2382 	 */
2383 	mutex_enter(&ire->ire_lock);
2384 	gw_addr = ire->ire_gateway_addr_v6;
2385 	mutex_exit(&ire->ire_lock);
2386 	gw_ipif = ire->ire_ipif;
2387 	ire_refrele(ire);
2388 
2389 	match_flags |= MATCH_IRE_IHANDLE;
2390 	ire = ire_ftable_lookup_v6(&gw_addr, 0, 0, IRE_INTERFACE,
2391 	    gw_ipif, NULL, ALL_ZONES, cire->ire_ihandle,
2392 	    NULL, match_flags, ipst);
2393 	return (ire);
2394 }
2395 
2396 /*
2397  * Return the IRE_LOOPBACK, IRE_IF_RESOLVER or IRE_IF_NORESOLVER
2398  * ire associated with the specified ipif.
2399  *
2400  * This might occasionally be called when IPIF_UP is not set since
2401  * the IPV6_MULTICAST_IF as well as creating interface routes
2402  * allows specifying a down ipif (ipif_lookup* match ipifs that are down).
2403  *
2404  * Note that if IPIF_NOLOCAL, IPIF_NOXMIT, or IPIF_DEPRECATED is set on
2405  * the ipif this routine might return NULL.
2406  * (Sometimes called as writer though not required by this function.)
2407  */
2408 ire_t *
2409 ipif_to_ire_v6(const ipif_t *ipif)
2410 {
2411 	ire_t	*ire;
2412 	ip_stack_t	*ipst = ipif->ipif_ill->ill_ipst;
2413 
2414 	ASSERT(ipif->ipif_isv6);
2415 	if (ipif->ipif_ire_type == IRE_LOOPBACK) {
2416 		ire = ire_ctable_lookup_v6(&ipif->ipif_v6lcl_addr, NULL,
2417 		    IRE_LOOPBACK, ipif, ALL_ZONES, NULL,
2418 		    (MATCH_IRE_TYPE | MATCH_IRE_IPIF), ipst);
2419 	} else if (ipif->ipif_flags & IPIF_POINTOPOINT) {
2420 		/* In this case we need to lookup destination address. */
2421 		ire = ire_ftable_lookup_v6(&ipif->ipif_v6pp_dst_addr,
2422 		    &ipv6_all_ones, NULL, IRE_INTERFACE, ipif, NULL, ALL_ZONES,
2423 		    0, NULL, (MATCH_IRE_TYPE | MATCH_IRE_IPIF |
2424 		    MATCH_IRE_MASK), ipst);
2425 	} else {
2426 		ire = ire_ftable_lookup_v6(&ipif->ipif_v6subnet,
2427 		    &ipif->ipif_v6net_mask, NULL, IRE_INTERFACE, ipif, NULL,
2428 		    ALL_ZONES, 0, NULL, (MATCH_IRE_TYPE | MATCH_IRE_IPIF |
2429 		    MATCH_IRE_MASK), ipst);
2430 	}
2431 	return (ire);
2432 }
2433 
2434 /*
2435  * Return B_TRUE if a multirt route is resolvable
2436  * (or if no route is resolved yet), B_FALSE otherwise.
2437  * This only works in the global zone.
2438  */
2439 boolean_t
2440 ire_multirt_need_resolve_v6(const in6_addr_t *v6dstp, const ts_label_t *tsl,
2441     ip_stack_t *ipst)
2442 {
2443 	ire_t	*first_fire;
2444 	ire_t	*first_cire;
2445 	ire_t	*fire;
2446 	ire_t	*cire;
2447 	irb_t	*firb;
2448 	irb_t	*cirb;
2449 	int	unres_cnt = 0;
2450 	boolean_t resolvable = B_FALSE;
2451 
2452 	/* Retrieve the first IRE_HOST that matches the destination */
2453 	first_fire = ire_ftable_lookup_v6(v6dstp, &ipv6_all_ones, 0, IRE_HOST,
2454 	    NULL, NULL, ALL_ZONES, 0, tsl, MATCH_IRE_MASK | MATCH_IRE_TYPE |
2455 	    MATCH_IRE_SECATTR, ipst);
2456 
2457 	/* No route at all */
2458 	if (first_fire == NULL) {
2459 		return (B_TRUE);
2460 	}
2461 
2462 	firb = first_fire->ire_bucket;
2463 	ASSERT(firb);
2464 
2465 	/* Retrieve the first IRE_CACHE ire for that destination. */
2466 	first_cire = ire_cache_lookup_v6(v6dstp, GLOBAL_ZONEID, tsl, ipst);
2467 
2468 	/* No resolved route. */
2469 	if (first_cire == NULL) {
2470 		ire_refrele(first_fire);
2471 		return (B_TRUE);
2472 	}
2473 
2474 	/* At least one route is resolved. */
2475 
2476 	cirb = first_cire->ire_bucket;
2477 	ASSERT(cirb);
2478 
2479 	/* Count the number of routes to that dest that are declared. */
2480 	IRB_REFHOLD(firb);
2481 	for (fire = first_fire; fire != NULL; fire = fire->ire_next) {
2482 		if (!(fire->ire_flags & RTF_MULTIRT))
2483 			continue;
2484 		if (!IN6_ARE_ADDR_EQUAL(&fire->ire_addr_v6, v6dstp))
2485 			continue;
2486 		unres_cnt++;
2487 	}
2488 	IRB_REFRELE(firb);
2489 
2490 
2491 	/* Then subtract the number of routes to that dst that are resolved */
2492 	IRB_REFHOLD(cirb);
2493 	for (cire = first_cire; cire != NULL; cire = cire->ire_next) {
2494 		if (!(cire->ire_flags & RTF_MULTIRT))
2495 			continue;
2496 		if (!IN6_ARE_ADDR_EQUAL(&cire->ire_addr_v6, v6dstp))
2497 			continue;
2498 		if (cire->ire_marks & (IRE_MARK_CONDEMNED|IRE_MARK_HIDDEN))
2499 			continue;
2500 		unres_cnt--;
2501 	}
2502 	IRB_REFRELE(cirb);
2503 
2504 	/* At least one route is unresolved; search for a resolvable route. */
2505 	if (unres_cnt > 0)
2506 		resolvable = ire_multirt_lookup_v6(&first_cire, &first_fire,
2507 		    MULTIRT_USESTAMP|MULTIRT_CACHEGW, tsl, ipst);
2508 
2509 	if (first_fire)
2510 		ire_refrele(first_fire);
2511 
2512 	if (first_cire)
2513 		ire_refrele(first_cire);
2514 
2515 	return (resolvable);
2516 }
2517 
2518 
2519 /*
2520  * Return B_TRUE and update *ire_arg and *fire_arg
2521  * if at least one resolvable route is found.
2522  * Return B_FALSE otherwise (all routes are resolved or
2523  * the remaining unresolved routes are all unresolvable).
2524  * This only works in the global zone.
2525  */
2526 boolean_t
2527 ire_multirt_lookup_v6(ire_t **ire_arg, ire_t **fire_arg, uint32_t flags,
2528     const ts_label_t *tsl, ip_stack_t *ipst)
2529 {
2530 	clock_t	delta;
2531 	ire_t	*best_fire = NULL;
2532 	ire_t	*best_cire = NULL;
2533 	ire_t	*first_fire;
2534 	ire_t	*first_cire;
2535 	ire_t	*fire;
2536 	ire_t	*cire;
2537 	irb_t	*firb = NULL;
2538 	irb_t	*cirb = NULL;
2539 	ire_t	*gw_ire;
2540 	boolean_t	already_resolved;
2541 	boolean_t	res;
2542 	in6_addr_t	v6dst;
2543 	in6_addr_t	v6gw;
2544 
2545 	ip2dbg(("ire_multirt_lookup_v6: *ire_arg %p, *fire_arg %p, "
2546 	    "flags %04x\n", (void *)*ire_arg, (void *)*fire_arg, flags));
2547 
2548 	ASSERT(ire_arg);
2549 	ASSERT(fire_arg);
2550 
2551 	/* Not an IRE_HOST ire; give up. */
2552 	if ((*fire_arg == NULL) ||
2553 	    ((*fire_arg)->ire_type != IRE_HOST)) {
2554 		return (B_FALSE);
2555 	}
2556 
2557 	/* This is the first IRE_HOST ire for that destination. */
2558 	first_fire = *fire_arg;
2559 	firb = first_fire->ire_bucket;
2560 	ASSERT(firb);
2561 
2562 	mutex_enter(&first_fire->ire_lock);
2563 	v6dst = first_fire->ire_addr_v6;
2564 	mutex_exit(&first_fire->ire_lock);
2565 
2566 	ip2dbg(("ire_multirt_lookup_v6: dst %08x\n",
2567 	    ntohl(V4_PART_OF_V6(v6dst))));
2568 
2569 	/*
2570 	 * Retrieve the first IRE_CACHE ire for that destination;
2571 	 * if we don't find one, no route for that dest is
2572 	 * resolved yet.
2573 	 */
2574 	first_cire = ire_cache_lookup_v6(&v6dst, GLOBAL_ZONEID, tsl, ipst);
2575 	if (first_cire) {
2576 		cirb = first_cire->ire_bucket;
2577 	}
2578 
2579 	ip2dbg(("ire_multirt_lookup_v6: first_cire %p\n", (void *)first_cire));
2580 
2581 	/*
2582 	 * Search for a resolvable route, giving the top priority
2583 	 * to routes that can be resolved without any call to the resolver.
2584 	 */
2585 	IRB_REFHOLD(firb);
2586 
2587 	if (!IN6_IS_ADDR_MULTICAST(&v6dst)) {
2588 		/*
2589 		 * For all multiroute IRE_HOST ires for that destination,
2590 		 * check if the route via the IRE_HOST's gateway is
2591 		 * resolved yet.
2592 		 */
2593 		for (fire = first_fire; fire != NULL; fire = fire->ire_next) {
2594 
2595 			if (!(fire->ire_flags & RTF_MULTIRT))
2596 				continue;
2597 			if (!IN6_ARE_ADDR_EQUAL(&fire->ire_addr_v6, &v6dst))
2598 				continue;
2599 
2600 			if (fire->ire_gw_secattr != NULL &&
2601 			    tsol_ire_match_gwattr(fire, tsl) != 0) {
2602 				continue;
2603 			}
2604 
2605 			mutex_enter(&fire->ire_lock);
2606 			v6gw = fire->ire_gateway_addr_v6;
2607 			mutex_exit(&fire->ire_lock);
2608 
2609 			ip2dbg(("ire_multirt_lookup_v6: fire %p, "
2610 			    "ire_addr %08x, ire_gateway_addr %08x\n",
2611 			    (void *)fire,
2612 			    ntohl(V4_PART_OF_V6(fire->ire_addr_v6)),
2613 			    ntohl(V4_PART_OF_V6(v6gw))));
2614 
2615 			already_resolved = B_FALSE;
2616 
2617 			if (first_cire) {
2618 				ASSERT(cirb);
2619 
2620 				IRB_REFHOLD(cirb);
2621 				/*
2622 				 * For all IRE_CACHE ires for that
2623 				 * destination.
2624 				 */
2625 				for (cire = first_cire;
2626 				    cire != NULL;
2627 				    cire = cire->ire_next) {
2628 
2629 					if (!(cire->ire_flags & RTF_MULTIRT))
2630 						continue;
2631 					if (!IN6_ARE_ADDR_EQUAL(
2632 					    &cire->ire_addr_v6, &v6dst))
2633 						continue;
2634 					if (cire->ire_marks &
2635 					    (IRE_MARK_CONDEMNED|
2636 					    IRE_MARK_HIDDEN))
2637 						continue;
2638 
2639 					if (cire->ire_gw_secattr != NULL &&
2640 					    tsol_ire_match_gwattr(cire,
2641 					    tsl) != 0) {
2642 						continue;
2643 					}
2644 
2645 					/*
2646 					 * Check if the IRE_CACHE's gateway
2647 					 * matches the IRE_HOST's gateway.
2648 					 */
2649 					if (IN6_ARE_ADDR_EQUAL(
2650 					    &cire->ire_gateway_addr_v6,
2651 					    &v6gw)) {
2652 						already_resolved = B_TRUE;
2653 						break;
2654 					}
2655 				}
2656 				IRB_REFRELE(cirb);
2657 			}
2658 
2659 			/*
2660 			 * This route is already resolved;
2661 			 * proceed with next one.
2662 			 */
2663 			if (already_resolved) {
2664 				ip2dbg(("ire_multirt_lookup_v6: found cire %p, "
2665 				    "already resolved\n", (void *)cire));
2666 				continue;
2667 			}
2668 
2669 			/*
2670 			 * The route is unresolved; is it actually
2671 			 * resolvable, i.e. is there a cache or a resolver
2672 			 * for the gateway?
2673 			 */
2674 			gw_ire = ire_route_lookup_v6(&v6gw, 0, 0, 0, NULL, NULL,
2675 			    ALL_ZONES, tsl, MATCH_IRE_RECURSIVE |
2676 			    MATCH_IRE_SECATTR, ipst);
2677 
2678 			ip2dbg(("ire_multirt_lookup_v6: looked up gw_ire %p\n",
2679 			    (void *)gw_ire));
2680 
2681 			/*
2682 			 * This route can be resolved without any call to the
2683 			 * resolver; if the MULTIRT_CACHEGW flag is set,
2684 			 * give the top priority to this ire and exit the
2685 			 * loop.
2686 			 * This occurs when an resolver reply is processed
2687 			 * through ip_wput_nondata()
2688 			 */
2689 			if ((flags & MULTIRT_CACHEGW) &&
2690 			    (gw_ire != NULL) &&
2691 			    (gw_ire->ire_type & IRE_CACHETABLE)) {
2692 				/*
2693 				 * Release the resolver associated to the
2694 				 * previous candidate best ire, if any.
2695 				 */
2696 				if (best_cire) {
2697 					ire_refrele(best_cire);
2698 					ASSERT(best_fire);
2699 				}
2700 
2701 				best_fire = fire;
2702 				best_cire = gw_ire;
2703 
2704 				ip2dbg(("ire_multirt_lookup_v6: found top prio "
2705 				    "best_fire %p, best_cire %p\n",
2706 				    (void *)best_fire, (void *)best_cire));
2707 				break;
2708 			}
2709 
2710 			/*
2711 			 * Compute the time elapsed since our preceding
2712 			 * attempt to  resolve that route.
2713 			 * If the MULTIRT_USESTAMP flag is set, we take that
2714 			 * route into account only if this time interval
2715 			 * exceeds ip_multirt_resolution_interval;
2716 			 * this prevents us from attempting to resolve a
2717 			 * broken route upon each sending of a packet.
2718 			 */
2719 			delta = lbolt - fire->ire_last_used_time;
2720 			delta = TICK_TO_MSEC(delta);
2721 
2722 			res = (boolean_t)
2723 			    ((delta > ipst->
2724 			    ips_ip_multirt_resolution_interval) ||
2725 			    (!(flags & MULTIRT_USESTAMP)));
2726 
2727 			ip2dbg(("ire_multirt_lookup_v6: fire %p, delta %lu, "
2728 			    "res %d\n",
2729 			    (void *)fire, delta, res));
2730 
2731 			if (res) {
2732 				/*
2733 				 * A resolver exists for the gateway: save
2734 				 * the current IRE_HOST ire as a candidate
2735 				 * best ire. If we later discover that a
2736 				 * top priority ire exists (i.e. no need to
2737 				 * call the resolver), then this new ire
2738 				 * will be preferred to the current one.
2739 				 */
2740 				if (gw_ire != NULL) {
2741 					if (best_fire == NULL) {
2742 						ASSERT(best_cire == NULL);
2743 
2744 						best_fire = fire;
2745 						best_cire = gw_ire;
2746 
2747 						ip2dbg(("ire_multirt_lookup_v6:"
2748 						    "found candidate "
2749 						    "best_fire %p, "
2750 						    "best_cire %p\n",
2751 						    (void *)best_fire,
2752 						    (void *)best_cire));
2753 
2754 						/*
2755 						 * If MULTIRT_CACHEGW is not
2756 						 * set, we ignore the top
2757 						 * priority ires that can
2758 						 * be resolved without any
2759 						 * call to the resolver;
2760 						 * In that case, there is
2761 						 * actually no need
2762 						 * to continue the loop.
2763 						 */
2764 						if (!(flags &
2765 						    MULTIRT_CACHEGW)) {
2766 							break;
2767 						}
2768 						continue;
2769 					}
2770 				} else {
2771 					/*
2772 					 * No resolver for the gateway: the
2773 					 * route is not resolvable.
2774 					 * If the MULTIRT_SETSTAMP flag is
2775 					 * set, we stamp the IRE_HOST ire,
2776 					 * so we will not select it again
2777 					 * during this resolution interval.
2778 					 */
2779 					if (flags & MULTIRT_SETSTAMP)
2780 						fire->ire_last_used_time =
2781 						    lbolt;
2782 				}
2783 			}
2784 
2785 			if (gw_ire != NULL)
2786 				ire_refrele(gw_ire);
2787 		}
2788 	} else { /* IN6_IS_ADDR_MULTICAST(&v6dst) */
2789 
2790 		for (fire = first_fire;
2791 		    fire != NULL;
2792 		    fire = fire->ire_next) {
2793 
2794 			if (!(fire->ire_flags & RTF_MULTIRT))
2795 				continue;
2796 			if (!IN6_ARE_ADDR_EQUAL(&fire->ire_addr_v6, &v6dst))
2797 				continue;
2798 
2799 			if (fire->ire_gw_secattr != NULL &&
2800 			    tsol_ire_match_gwattr(fire, tsl) != 0) {
2801 				continue;
2802 			}
2803 
2804 			already_resolved = B_FALSE;
2805 
2806 			mutex_enter(&fire->ire_lock);
2807 			v6gw = fire->ire_gateway_addr_v6;
2808 			mutex_exit(&fire->ire_lock);
2809 
2810 			gw_ire = ire_ftable_lookup_v6(&v6gw, 0, 0,
2811 			    IRE_INTERFACE, NULL, NULL, ALL_ZONES, 0, tsl,
2812 			    MATCH_IRE_RECURSIVE | MATCH_IRE_TYPE |
2813 			    MATCH_IRE_SECATTR, ipst);
2814 
2815 			/* No resolver for the gateway; we skip this ire. */
2816 			if (gw_ire == NULL) {
2817 				continue;
2818 			}
2819 
2820 			if (first_cire) {
2821 
2822 				IRB_REFHOLD(cirb);
2823 				/*
2824 				 * For all IRE_CACHE ires for that
2825 				 * destination.
2826 				 */
2827 				for (cire = first_cire;
2828 				    cire != NULL;
2829 				    cire = cire->ire_next) {
2830 
2831 					if (!(cire->ire_flags & RTF_MULTIRT))
2832 						continue;
2833 					if (!IN6_ARE_ADDR_EQUAL(
2834 					    &cire->ire_addr_v6, &v6dst))
2835 						continue;
2836 					if (cire->ire_marks &
2837 					    (IRE_MARK_CONDEMNED|
2838 					    IRE_MARK_HIDDEN))
2839 						continue;
2840 
2841 					if (cire->ire_gw_secattr != NULL &&
2842 					    tsol_ire_match_gwattr(cire,
2843 					    tsl) != 0) {
2844 						continue;
2845 					}
2846 
2847 					/*
2848 					 * Cache entries are linked to the
2849 					 * parent routes using the parent handle
2850 					 * (ire_phandle). If no cache entry has
2851 					 * the same handle as fire, fire is
2852 					 * still unresolved.
2853 					 */
2854 					ASSERT(cire->ire_phandle != 0);
2855 					if (cire->ire_phandle ==
2856 					    fire->ire_phandle) {
2857 						already_resolved = B_TRUE;
2858 						break;
2859 					}
2860 				}
2861 				IRB_REFRELE(cirb);
2862 			}
2863 
2864 			/*
2865 			 * This route is already resolved; proceed with
2866 			 * next one.
2867 			 */
2868 			if (already_resolved) {
2869 				ire_refrele(gw_ire);
2870 				continue;
2871 			}
2872 
2873 			/*
2874 			 * Compute the time elapsed since our preceding
2875 			 * attempt to resolve that route.
2876 			 * If the MULTIRT_USESTAMP flag is set, we take
2877 			 * that route into account only if this time
2878 			 * interval exceeds ip_multirt_resolution_interval;
2879 			 * this prevents us from attempting to resolve a
2880 			 * broken route upon each sending of a packet.
2881 			 */
2882 			delta = lbolt - fire->ire_last_used_time;
2883 			delta = TICK_TO_MSEC(delta);
2884 
2885 			res = (boolean_t)
2886 			    ((delta > ipst->
2887 			    ips_ip_multirt_resolution_interval) ||
2888 			    (!(flags & MULTIRT_USESTAMP)));
2889 
2890 			ip3dbg(("ire_multirt_lookup_v6: fire %p, delta %lx, "
2891 			    "flags %04x, res %d\n",
2892 			    (void *)fire, delta, flags, res));
2893 
2894 			if (res) {
2895 				if (best_cire) {
2896 					/*
2897 					 * Release the resolver associated
2898 					 * to the preceding candidate best
2899 					 * ire, if any.
2900 					 */
2901 					ire_refrele(best_cire);
2902 					ASSERT(best_fire);
2903 				}
2904 				best_fire = fire;
2905 				best_cire = gw_ire;
2906 				continue;
2907 			}
2908 
2909 			ire_refrele(gw_ire);
2910 		}
2911 	}
2912 
2913 	if (best_fire) {
2914 		IRE_REFHOLD(best_fire);
2915 	}
2916 	IRB_REFRELE(firb);
2917 
2918 	/* Release the first IRE_CACHE we initially looked up, if any. */
2919 	if (first_cire)
2920 		ire_refrele(first_cire);
2921 
2922 	/* Found a resolvable route. */
2923 	if (best_fire) {
2924 		ASSERT(best_cire);
2925 
2926 		if (*fire_arg)
2927 			ire_refrele(*fire_arg);
2928 		if (*ire_arg)
2929 			ire_refrele(*ire_arg);
2930 
2931 		/*
2932 		 * Update the passed arguments with the
2933 		 * resolvable multirt route we found
2934 		 */
2935 		*fire_arg = best_fire;
2936 		*ire_arg = best_cire;
2937 
2938 		ip2dbg(("ire_multirt_lookup_v6: returning B_TRUE, "
2939 		    "*fire_arg %p, *ire_arg %p\n",
2940 		    (void *)best_fire, (void *)best_cire));
2941 
2942 		return (B_TRUE);
2943 	}
2944 
2945 	ASSERT(best_cire == NULL);
2946 
2947 	ip2dbg(("ire_multirt_lookup_v6: returning B_FALSE, *fire_arg %p, "
2948 	    "*ire_arg %p\n",
2949 	    (void *)*fire_arg, (void *)*ire_arg));
2950 
2951 	/* No resolvable route. */
2952 	return (B_FALSE);
2953 }
2954 
2955 
2956 /*
2957  * Find an IRE_OFFSUBNET IRE entry for the multicast address 'v6dstp'
2958  * that goes through 'ipif'. As a fallback, a route that goes through
2959  * ipif->ipif_ill can be returned.
2960  */
2961 ire_t *
2962 ipif_lookup_multi_ire_v6(ipif_t *ipif, const in6_addr_t *v6dstp)
2963 {
2964 	ire_t	*ire;
2965 	ire_t	*save_ire = NULL;
2966 	ire_t   *gw_ire;
2967 	irb_t   *irb;
2968 	in6_addr_t v6gw;
2969 	int	match_flags = MATCH_IRE_TYPE | MATCH_IRE_ILL;
2970 	ip_stack_t	*ipst = ipif->ipif_ill->ill_ipst;
2971 
2972 	ire = ire_ftable_lookup_v6(v6dstp, 0, 0, 0, NULL, NULL, ALL_ZONES, 0,
2973 	    NULL, MATCH_IRE_DEFAULT, ipst);
2974 
2975 	if (ire == NULL)
2976 		return (NULL);
2977 
2978 	irb = ire->ire_bucket;
2979 	ASSERT(irb);
2980 
2981 	IRB_REFHOLD(irb);
2982 	ire_refrele(ire);
2983 	for (ire = irb->irb_ire; ire != NULL; ire = ire->ire_next) {
2984 		if (!IN6_ARE_ADDR_EQUAL(&ire->ire_addr_v6, v6dstp) ||
2985 		    (ipif->ipif_zoneid != ire->ire_zoneid &&
2986 		    ire->ire_zoneid != ALL_ZONES)) {
2987 			continue;
2988 		}
2989 
2990 		switch (ire->ire_type) {
2991 		case IRE_DEFAULT:
2992 		case IRE_PREFIX:
2993 		case IRE_HOST:
2994 			mutex_enter(&ire->ire_lock);
2995 			v6gw = ire->ire_gateway_addr_v6;
2996 			mutex_exit(&ire->ire_lock);
2997 			gw_ire = ire_ftable_lookup_v6(&v6gw, 0, 0,
2998 			    IRE_INTERFACE, ipif, NULL, ALL_ZONES, 0,
2999 			    NULL, match_flags, ipst);
3000 
3001 			if (gw_ire != NULL) {
3002 				if (save_ire != NULL) {
3003 					ire_refrele(save_ire);
3004 				}
3005 				IRE_REFHOLD(ire);
3006 				if (gw_ire->ire_ipif == ipif) {
3007 					ire_refrele(gw_ire);
3008 
3009 					IRB_REFRELE(irb);
3010 					return (ire);
3011 				}
3012 				ire_refrele(gw_ire);
3013 				save_ire = ire;
3014 			}
3015 			break;
3016 		case IRE_IF_NORESOLVER:
3017 		case IRE_IF_RESOLVER:
3018 			if (ire->ire_ipif == ipif) {
3019 				if (save_ire != NULL) {
3020 					ire_refrele(save_ire);
3021 				}
3022 				IRE_REFHOLD(ire);
3023 
3024 				IRB_REFRELE(irb);
3025 				return (ire);
3026 			}
3027 			break;
3028 		}
3029 	}
3030 	IRB_REFRELE(irb);
3031 
3032 	return (save_ire);
3033 }
3034