xref: /illumos-gate/usr/src/uts/common/inet/ip/ip6_ire.c (revision ac20c57d6652cecf7859e3346336b9a48e5d5f82)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 /*
26  * Copyright (c) 1990 Mentat Inc.
27  */
28 
29 #pragma ident	"%Z%%M%	%I%	%E% SMI"
30 
31 /*
32  * This file contains routines that manipulate Internet Routing Entries (IREs).
33  */
34 #include <sys/types.h>
35 #include <sys/stream.h>
36 #include <sys/stropts.h>
37 #include <sys/ddi.h>
38 #include <sys/cmn_err.h>
39 
40 #include <sys/systm.h>
41 #include <sys/param.h>
42 #include <sys/socket.h>
43 #include <net/if.h>
44 #include <net/route.h>
45 #include <netinet/in.h>
46 #include <net/if_dl.h>
47 #include <netinet/ip6.h>
48 #include <netinet/icmp6.h>
49 
50 #include <inet/common.h>
51 #include <inet/mi.h>
52 #include <inet/ip.h>
53 #include <inet/ip6.h>
54 #include <inet/ip_ndp.h>
55 #include <inet/ip_if.h>
56 #include <inet/ip_ire.h>
57 #include <inet/ipclassifier.h>
58 #include <inet/nd.h>
59 #include <sys/kmem.h>
60 #include <sys/zone.h>
61 
62 #include <sys/tsol/label.h>
63 #include <sys/tsol/tnet.h>
64 
/*
 * Template IRE with static storage and no initializer, hence all zeroes.
 * The allocation routines in this file assign it over a freshly obtained
 * ire_t so that initialization always starts from a known-clean state.
 */
65 static	ire_t	ire_null;
66 
/* Prototypes for the static helpers used before their definitions. */
67 static ire_t	*ire_ihandle_lookup_onlink_v6(ire_t *cire);
68 static	void	ire_report_ftable_v6(ire_t *ire, char *mp);
69 static	void	ire_report_ctable_v6(ire_t *ire, char *mp);
70 static boolean_t ire_match_args_v6(ire_t *ire, const in6_addr_t *addr,
71     const in6_addr_t *mask, const in6_addr_t *gateway, int type,
72     const ipif_t *ipif, zoneid_t zoneid, uint32_t ihandle,
73     const ts_label_t *tsl, int match_flags);
74 
75 /*
76  * Named Dispatch routine to produce a formatted report on all IREs.
77  * This report is accessed by using the ndd utility to "get" ND variable
78  * "ip_ire_status_v6".
79  */
80 /* ARGSUSED */
81 int
82 ip_ire_report_v6(queue_t *q, mblk_t *mp, caddr_t arg, cred_t *ioc_cr)
83 {
84 	zoneid_t zoneid;
85 	ip_stack_t *ipst;
86 
87 	(void) mi_mpprintf(mp,
88 	    "IRE      " MI_COL_HDRPAD_STR
89 	    "rfq      " MI_COL_HDRPAD_STR
90 	    "stq      " MI_COL_HDRPAD_STR
91 	    " zone mxfrg rtt   rtt_sd ssthresh ref "
92 	    "rtomax tstamp_ok wscale_ok ecn_ok pmtud_ok sack sendpipe recvpipe "
93 	    "in/out/forward type    addr         mask         "
94 	    "src             gateway");
95 	/*
96 	 *   01234567 01234567 01234567 12345 12345 12345 12345  12345678 123
97 	 *   123456 123456789 123456789 123456 12345678 1234 12345678 12345678
98 	 *   in/out/forward xxxxxxxxxx
99 	 *   xxxx:xxxx:xxxx:xxxx:xxxx:xxxx:xxxx:xxxx
100 	 *   xxxx:xxxx:xxxx:xxxx:xxxx:xxxx:xxxx:xxxx
101 	 *   xxxx:xxxx:xxxx:xxxx:xxxx:xxxx:xxxx:xxxx
102 	 *   xxxx:xxxx:xxxx:xxxx:xxxx:xxxx:xxxx:xxxx
103 	 */
104 
105 	/*
106 	 * Because of the ndd constraint, at most we can have 64K buffer
107 	 * to put in all IRE info.  So to be more efficient, just
108 	 * allocate a 64K buffer here, assuming we need that large buffer.
109 	 * This should be OK as only root can do ndd /dev/ip.
110 	 */
111 	if ((mp->b_cont = allocb(ND_MAX_BUF_LEN, BPRI_HI)) == NULL) {
112 		/* The following may work even if we cannot get a large buf. */
113 		(void) mi_mpprintf(mp, "<< Out of buffer >>\n");
114 		return (0);
115 	}
116 	zoneid = Q_TO_CONN(q)->conn_zoneid;
117 	if (zoneid == GLOBAL_ZONEID)
118 		zoneid = ALL_ZONES;
119 	ipst = CONNQ_TO_IPST(q);
120 
121 	ire_walk_v6(ire_report_ftable_v6, (char *)mp->b_cont, zoneid, ipst);
122 	ire_walk_v6(ire_report_ctable_v6, (char *)mp->b_cont, zoneid, ipst);
123 	return (0);
124 }
125 
126 /*
127  * ire_walk routine invoked for ip_ire_report_v6 for each IRE.
128  */
129 static void
130 ire_report_ftable_v6(ire_t *ire, char *mp)
131 {
132 	char	buf1[INET6_ADDRSTRLEN];
133 	char	buf2[INET6_ADDRSTRLEN];
134 	char	buf3[INET6_ADDRSTRLEN];
135 	char	buf4[INET6_ADDRSTRLEN];
136 	uint_t	fo_pkt_count;
137 	uint_t	ib_pkt_count;
138 	int	ref;
139 	in6_addr_t gw_addr_v6;
140 	uint_t	print_len, buf_len;
141 
142 	ASSERT(ire->ire_ipversion == IPV6_VERSION);
143 	if (ire->ire_type & IRE_CACHETABLE)
144 	    return;
145 	buf_len = ((mblk_t *)mp)->b_datap->db_lim - ((mblk_t *)mp)->b_wptr;
146 	if (buf_len <= 0)
147 		return;
148 
149 	/* Number of active references of this ire */
150 	ref = ire->ire_refcnt;
151 	/* "inbound" to a non local address is a forward */
152 	ib_pkt_count = ire->ire_ib_pkt_count;
153 	fo_pkt_count = 0;
154 	ASSERT(!(ire->ire_type & IRE_BROADCAST));
155 	if (!(ire->ire_type & (IRE_LOCAL|IRE_BROADCAST))) {
156 		fo_pkt_count = ib_pkt_count;
157 		ib_pkt_count = 0;
158 	}
159 
160 	mutex_enter(&ire->ire_lock);
161 	gw_addr_v6 = ire->ire_gateway_addr_v6;
162 	mutex_exit(&ire->ire_lock);
163 
164 	print_len = snprintf((char *)((mblk_t *)mp)->b_wptr, buf_len,
165 	    MI_COL_PTRFMT_STR MI_COL_PTRFMT_STR MI_COL_PTRFMT_STR "%5d "
166 	    "%05d %05ld %06ld %08d %03d %06d %09d %09d %06d %08d "
167 	    "%04d %08d %08d %d/%d/%d %s\n\t%s\n\t%s\n\t%s\n\t%s\n",
168 	    (void *)ire, (void *)ire->ire_rfq, (void *)ire->ire_stq,
169 	    (int)ire->ire_zoneid,
170 	    ire->ire_max_frag, ire->ire_uinfo.iulp_rtt,
171 	    ire->ire_uinfo.iulp_rtt_sd,
172 	    ire->ire_uinfo.iulp_ssthresh, ref,
173 	    ire->ire_uinfo.iulp_rtomax,
174 	    (ire->ire_uinfo.iulp_tstamp_ok ? 1: 0),
175 	    (ire->ire_uinfo.iulp_wscale_ok ? 1: 0),
176 	    (ire->ire_uinfo.iulp_ecn_ok ? 1: 0),
177 	    (ire->ire_uinfo.iulp_pmtud_ok ? 1: 0),
178 	    ire->ire_uinfo.iulp_sack,
179 	    ire->ire_uinfo.iulp_spipe, ire->ire_uinfo.iulp_rpipe,
180 	    ib_pkt_count, ire->ire_ob_pkt_count, fo_pkt_count,
181 	    ip_nv_lookup(ire_nv_tbl, (int)ire->ire_type),
182 	    inet_ntop(AF_INET6, &ire->ire_addr_v6, buf1, sizeof (buf1)),
183 	    inet_ntop(AF_INET6, &ire->ire_mask_v6, buf2, sizeof (buf2)),
184 	    inet_ntop(AF_INET6, &ire->ire_src_addr_v6, buf3, sizeof (buf3)),
185 	    inet_ntop(AF_INET6, &gw_addr_v6, buf4, sizeof (buf4)));
186 	if (print_len < buf_len) {
187 		((mblk_t *)mp)->b_wptr += print_len;
188 	} else {
189 		((mblk_t *)mp)->b_wptr += buf_len;
190 	}
191 }
192 
193 /* ire_walk routine invoked for ip_ire_report_v6 for each IRE. */
194 static void
195 ire_report_ctable_v6(ire_t *ire, char *mp)
196 {
197 	char	buf1[INET6_ADDRSTRLEN];
198 	char	buf2[INET6_ADDRSTRLEN];
199 	char	buf3[INET6_ADDRSTRLEN];
200 	char	buf4[INET6_ADDRSTRLEN];
201 	uint_t	fo_pkt_count;
202 	uint_t	ib_pkt_count;
203 	int	ref;
204 	in6_addr_t gw_addr_v6;
205 	uint_t	print_len, buf_len;
206 
207 	if ((ire->ire_type & IRE_CACHETABLE) == 0)
208 		return;
209 	buf_len = ((mblk_t *)mp)->b_datap->db_lim - ((mblk_t *)mp)->b_wptr;
210 	if (buf_len <= 0)
211 		return;
212 
213 	/* Number of active references of this ire */
214 	ref = ire->ire_refcnt;
215 	/* "inbound" to a non local address is a forward */
216 	ib_pkt_count = ire->ire_ib_pkt_count;
217 	fo_pkt_count = 0;
218 	ASSERT(!(ire->ire_type & IRE_BROADCAST));
219 	if (ire->ire_type & IRE_LOCAL) {
220 		fo_pkt_count = ib_pkt_count;
221 		ib_pkt_count = 0;
222 	}
223 
224 	mutex_enter(&ire->ire_lock);
225 	gw_addr_v6 = ire->ire_gateway_addr_v6;
226 	mutex_exit(&ire->ire_lock);
227 
228 	print_len =  snprintf((char *)((mblk_t *)mp)->b_wptr, buf_len,
229 	    MI_COL_PTRFMT_STR MI_COL_PTRFMT_STR MI_COL_PTRFMT_STR "%5d "
230 	    "%05d %05ld %06ld %08d %03d %06d %09d %09d %06d %08d "
231 	    "%04d %08d %08d %d/%d/%d %s\n\t%s\n\t%s\n\t%s\n\t%s\n",
232 	    (void *)ire, (void *)ire->ire_rfq, (void *)ire->ire_stq,
233 	    (int)ire->ire_zoneid,
234 	    ire->ire_max_frag, ire->ire_uinfo.iulp_rtt,
235 	    ire->ire_uinfo.iulp_rtt_sd, ire->ire_uinfo.iulp_ssthresh, ref,
236 	    ire->ire_uinfo.iulp_rtomax,
237 	    (ire->ire_uinfo.iulp_tstamp_ok ? 1: 0),
238 	    (ire->ire_uinfo.iulp_wscale_ok ? 1: 0),
239 	    (ire->ire_uinfo.iulp_ecn_ok ? 1: 0),
240 	    (ire->ire_uinfo.iulp_pmtud_ok ? 1: 0),
241 	    ire->ire_uinfo.iulp_sack,
242 	    ire->ire_uinfo.iulp_spipe, ire->ire_uinfo.iulp_rpipe,
243 	    ib_pkt_count, ire->ire_ob_pkt_count,
244 	    fo_pkt_count, ip_nv_lookup(ire_nv_tbl, (int)ire->ire_type),
245 	    inet_ntop(AF_INET6, &ire->ire_addr_v6, buf1, sizeof (buf1)),
246 	    inet_ntop(AF_INET6, &ire->ire_mask_v6, buf2, sizeof (buf2)),
247 	    inet_ntop(AF_INET6, &ire->ire_src_addr_v6, buf3, sizeof (buf3)),
248 	    inet_ntop(AF_INET6, &gw_addr_v6, buf4, sizeof (buf4)));
249 	if (print_len < buf_len) {
250 		((mblk_t *)mp)->b_wptr += print_len;
251 	} else {
252 		((mblk_t *)mp)->b_wptr += buf_len;
253 	}
254 }
255 
256 
257 /*
258  * Initialize the ire that is specific to IPv6 part and call
259  * ire_init_common to finish it.
260  */
261 ire_t *
262 ire_init_v6(ire_t *ire, const in6_addr_t *v6addr,
263     const in6_addr_t *v6mask, const in6_addr_t *v6src_addr,
264     const in6_addr_t *v6gateway, uint_t *max_fragp,
265     mblk_t *fp_mp, queue_t *rfq, queue_t *stq, ushort_t type,
266     mblk_t *dlureq_mp, ipif_t *ipif, const in6_addr_t *v6cmask,
267     uint32_t phandle, uint32_t ihandle, uint_t flags, const iulp_t *ulp_info,
268     tsol_gc_t *gc, tsol_gcgrp_t *gcgrp, ip_stack_t *ipst)
269 {
270 
271 	/*
272 	 * Reject IRE security attribute creation/initialization
273 	 * if system is not running in Trusted mode.
274 	 */
275 	if ((gc != NULL || gcgrp != NULL) && !is_system_labeled())
276 		return (NULL);
277 
278 	if (fp_mp != NULL) {
279 		/*
280 		 * We can't dupb() here as multiple threads could be
281 		 * calling dupb on the same mp which is incorrect.
282 		 * First dupb() should be called only by one thread.
283 		 */
284 		fp_mp = copyb(fp_mp);
285 		if (fp_mp == NULL)
286 			return (NULL);
287 	}
288 
289 	if (dlureq_mp != NULL) {
290 		/*
291 		 * We can't dupb() here as multiple threads could be
292 		 * calling dupb on the same mp which is incorrect.
293 		 * First dupb() should be called only by one thread.
294 		 */
295 		dlureq_mp = copyb(dlureq_mp);
296 		if (dlureq_mp == NULL) {
297 			if (fp_mp != NULL)
298 				freeb(fp_mp);
299 			return (NULL);
300 		}
301 	}
302 
303 	BUMP_IRE_STATS(ipst->ips_ire_stats_v6, ire_stats_alloced);
304 	ire->ire_addr_v6 = *v6addr;
305 
306 	if (v6src_addr != NULL)
307 		ire->ire_src_addr_v6 = *v6src_addr;
308 	if (v6mask != NULL) {
309 		ire->ire_mask_v6 = *v6mask;
310 		ire->ire_masklen = ip_mask_to_plen_v6(&ire->ire_mask_v6);
311 	}
312 	if (v6gateway != NULL)
313 		ire->ire_gateway_addr_v6 = *v6gateway;
314 
315 	if (type == IRE_CACHE && v6cmask != NULL)
316 		ire->ire_cmask_v6 = *v6cmask;
317 
318 	/*
319 	 * Multirouted packets need to have a fragment header added so that
320 	 * the receiver is able to discard duplicates according to their
321 	 * fragment identifier.
322 	 */
323 	if (type == IRE_CACHE && (flags & RTF_MULTIRT)) {
324 		ire->ire_frag_flag = IPH_FRAG_HDR;
325 	}
326 
327 	/* ire_init_common will free the mblks upon encountering any failure */
328 	if (!ire_init_common(ire, max_fragp, fp_mp, rfq, stq, type, dlureq_mp,
329 	    ipif, NULL, phandle, ihandle, flags, IPV6_VERSION, ulp_info,
330 	    gc, gcgrp, ipst))
331 		return (NULL);
332 
333 	return (ire);
334 }
335 
336 /*
337  * Similar to ire_create_v6 except that it is called only when
338  * we want to allocate ire as an mblk e.g. we have a external
339  * resolver. Do we need this in IPv6 ?
340  */
341 ire_t *
342 ire_create_mp_v6(const in6_addr_t *v6addr, const in6_addr_t *v6mask,
343     const in6_addr_t *v6src_addr, const in6_addr_t *v6gateway,
344     mblk_t *fp_mp, queue_t *rfq, queue_t *stq, ushort_t type,
345     mblk_t *dlureq_mp, ipif_t *ipif, const in6_addr_t *v6cmask,
346     uint32_t phandle, uint32_t ihandle, uint_t flags, const iulp_t *ulp_info,
347     tsol_gc_t *gc, tsol_gcgrp_t *gcgrp, ip_stack_t *ipst)
348 {
349 	ire_t	*ire;
350 	ire_t	*ret_ire;
351 	mblk_t	*mp;
352 
353 	ASSERT(!IN6_IS_ADDR_V4MAPPED(v6addr));
354 
355 	/* Allocate the new IRE. */
356 	mp = allocb(sizeof (ire_t), BPRI_MED);
357 	if (mp == NULL) {
358 		ip1dbg(("ire_create_mp_v6: alloc failed\n"));
359 		return (NULL);
360 	}
361 
362 	ire = (ire_t *)mp->b_rptr;
363 	mp->b_wptr = (uchar_t *)&ire[1];
364 
365 	/* Start clean. */
366 	*ire = ire_null;
367 	ire->ire_mp = mp;
368 	mp->b_datap->db_type = IRE_DB_TYPE;
369 
370 	ret_ire = ire_init_v6(ire, v6addr, v6mask, v6src_addr, v6gateway,
371 	    NULL, fp_mp, rfq, stq, type, dlureq_mp, ipif, v6cmask, phandle,
372 	    ihandle, flags, ulp_info, gc, gcgrp, ipst);
373 
374 	if (ret_ire == NULL) {
375 		freeb(ire->ire_mp);
376 		return (NULL);
377 	}
378 	return (ire);
379 }
380 
381 /*
382  * ire_create_v6 is called to allocate and initialize a new IRE.
383  *
384  * NOTE : This is called as writer sometimes though not required
385  * by this function.
386  */
387 ire_t *
388 ire_create_v6(const in6_addr_t *v6addr, const in6_addr_t *v6mask,
389     const in6_addr_t *v6src_addr, const in6_addr_t *v6gateway,
390     uint_t *max_fragp, mblk_t *fp_mp, queue_t *rfq, queue_t *stq, ushort_t type,
391     mblk_t *dlureq_mp, ipif_t *ipif, const in6_addr_t *v6cmask,
392     uint32_t phandle, uint32_t ihandle, uint_t flags, const iulp_t *ulp_info,
393     tsol_gc_t *gc, tsol_gcgrp_t *gcgrp, ip_stack_t *ipst)
394 {
395 	ire_t	*ire;
396 	ire_t	*ret_ire;
397 
398 	ASSERT(!IN6_IS_ADDR_V4MAPPED(v6addr));
399 
400 	ire = kmem_cache_alloc(ire_cache, KM_NOSLEEP);
401 	if (ire == NULL) {
402 		ip1dbg(("ire_create_v6: alloc failed\n"));
403 		return (NULL);
404 	}
405 	*ire = ire_null;
406 
407 	ret_ire = ire_init_v6(ire, v6addr, v6mask, v6src_addr, v6gateway,
408 	    max_fragp, fp_mp, rfq, stq, type, dlureq_mp, ipif, v6cmask, phandle,
409 	    ihandle, flags, ulp_info, gc, gcgrp, ipst);
410 
411 	if (ret_ire == NULL) {
412 		kmem_cache_free(ire_cache, ire);
413 		return (NULL);
414 	}
415 	ASSERT(ret_ire == ire);
416 	return (ire);
417 }
418 
419 /*
420  * Find an IRE_INTERFACE for the multicast group.
421  * Allows different routes for multicast addresses
422  * in the unicast routing table (akin to FF::0/8 but could be more specific)
423  * which point at different interfaces. This is used when IPV6_MULTICAST_IF
424  * isn't specified (when sending) and when IPV6_JOIN_GROUP doesn't
425  * specify the interface to join on.
426  *
427  * Supports link-local addresses by following the ipif/ill when recursing.
428  */
429 ire_t *
430 ire_lookup_multi_v6(const in6_addr_t *group, zoneid_t zoneid, ip_stack_t *ipst)
431 {
432 	ire_t	*ire;
433 	ipif_t	*ipif = NULL;
434 	int	match_flags = MATCH_IRE_TYPE;
435 	in6_addr_t gw_addr_v6;
436 
437 	ire = ire_ftable_lookup_v6(group, 0, 0, 0, NULL, NULL,
438 	    zoneid, 0, NULL, MATCH_IRE_DEFAULT, ipst);
439 
440 	/* We search a resolvable ire in case of multirouting. */
441 	if ((ire != NULL) && (ire->ire_flags & RTF_MULTIRT)) {
442 		ire_t *cire = NULL;
443 		/*
444 		 * If the route is not resolvable, the looked up ire
445 		 * may be changed here. In that case, ire_multirt_lookup()
446 		 * IRE_REFRELE the original ire and change it.
447 		 */
448 		(void) ire_multirt_lookup_v6(&cire, &ire, MULTIRT_CACHEGW,
449 		    NULL, ipst);
450 		if (cire != NULL)
451 			ire_refrele(cire);
452 	}
453 	if (ire == NULL)
454 		return (NULL);
455 	/*
456 	 * Make sure we follow ire_ipif.
457 	 *
458 	 * We need to determine the interface route through
459 	 * which the gateway will be reached. We don't really
460 	 * care which interface is picked if the interface is
461 	 * part of a group.
462 	 */
463 	if (ire->ire_ipif != NULL) {
464 		ipif = ire->ire_ipif;
465 		match_flags |= MATCH_IRE_ILL_GROUP;
466 	}
467 
468 	switch (ire->ire_type) {
469 	case IRE_DEFAULT:
470 	case IRE_PREFIX:
471 	case IRE_HOST:
472 		mutex_enter(&ire->ire_lock);
473 		gw_addr_v6 = ire->ire_gateway_addr_v6;
474 		mutex_exit(&ire->ire_lock);
475 		ire_refrele(ire);
476 		ire = ire_ftable_lookup_v6(&gw_addr_v6, 0, 0,
477 		    IRE_INTERFACE, ipif, NULL, zoneid, 0,
478 		    NULL, match_flags, ipst);
479 		return (ire);
480 	case IRE_IF_NORESOLVER:
481 	case IRE_IF_RESOLVER:
482 		return (ire);
483 	default:
484 		ire_refrele(ire);
485 		return (NULL);
486 	}
487 }
488 
489 /*
490  * Return any local address.  We use this to target ourselves
491  * when the src address was specified as 'default'.
492  * Preference for IRE_LOCAL entries.
493  */
494 ire_t *
495 ire_lookup_local_v6(zoneid_t zoneid, ip_stack_t *ipst)
496 {
497 	ire_t	*ire;
498 	irb_t	*irb;
499 	ire_t	*maybe = NULL;
500 	int i;
501 
502 	for (i = 0; i < ipst->ips_ip6_cache_table_size;  i++) {
503 		irb = &ipst->ips_ip_cache_table_v6[i];
504 		if (irb->irb_ire == NULL)
505 			continue;
506 		rw_enter(&irb->irb_lock, RW_READER);
507 		for (ire = irb->irb_ire; ire; ire = ire->ire_next) {
508 			if ((ire->ire_marks & IRE_MARK_CONDEMNED) ||
509 			    ire->ire_zoneid != zoneid &&
510 			    ire->ire_zoneid != ALL_ZONES)
511 				continue;
512 			switch (ire->ire_type) {
513 			case IRE_LOOPBACK:
514 				if (maybe == NULL) {
515 					IRE_REFHOLD(ire);
516 					maybe = ire;
517 				}
518 				break;
519 			case IRE_LOCAL:
520 				if (maybe != NULL) {
521 					ire_refrele(maybe);
522 				}
523 				IRE_REFHOLD(ire);
524 				rw_exit(&irb->irb_lock);
525 				return (ire);
526 			}
527 		}
528 		rw_exit(&irb->irb_lock);
529 	}
530 	return (maybe);
531 }
532 
533 /*
534  * This function takes a mask and returns number of bits set in the
535  * mask (the represented prefix length).  Assumes a contiguous mask.
536  */
537 int
538 ip_mask_to_plen_v6(const in6_addr_t *v6mask)
539 {
540 	int		bits;
541 	int		plen = IPV6_ABITS;
542 	int		i;
543 
544 	for (i = 3; i >= 0; i--) {
545 		if (v6mask->s6_addr32[i] == 0) {
546 			plen -= 32;
547 			continue;
548 		}
549 		bits = ffs(ntohl(v6mask->s6_addr32[i])) - 1;
550 		if (bits == 0)
551 			break;
552 		plen -= bits;
553 	}
554 
555 	return (plen);
556 }
557 
558 /*
559  * Convert a prefix length to the mask for that prefix.
560  * Returns the argument bitmask.
561  */
562 in6_addr_t *
563 ip_plen_to_mask_v6(uint_t plen, in6_addr_t *bitmask)
564 {
565 	uint32_t *ptr;
566 
567 	if (plen < 0 || plen > IPV6_ABITS)
568 		return (NULL);
569 	*bitmask = ipv6_all_zeros;
570 
571 	ptr = (uint32_t *)bitmask;
572 	while (plen > 32) {
573 		*ptr++ = 0xffffffffU;
574 		plen -= 32;
575 	}
576 	*ptr = htonl(0xffffffffU << (32 - plen));
577 	return (bitmask);
578 }
579 
580 /*
581  * Add a fully initialized IRE to an appropriate
582  * table based on ire_type.
583  *
584  * The forward table contains IRE_PREFIX/IRE_HOST and
585  * IRE_IF_RESOLVER/IRE_IF_NORESOLVER and IRE_DEFAULT.
586  *
587  * The cache table contains IRE_BROADCAST/IRE_LOCAL/IRE_LOOPBACK
588  * and IRE_CACHE.
589  *
590  * NOTE : This function is called as writer though not required
591  * by this function.
592  */
593 int
594 ire_add_v6(ire_t **ire_p, queue_t *q, mblk_t *mp, ipsq_func_t func)
595 {
596 	ire_t	*ire1;
597 	int	mask_table_index;
598 	irb_t	*irb_ptr;
599 	ire_t	**irep;
600 	int	flags;
601 	ire_t	*pire = NULL;
602 	ill_t	*stq_ill;
603 	boolean_t	ndp_g_lock_held = B_FALSE;
604 	ire_t	*ire = *ire_p;
605 	int	error;
606 	ip_stack_t	*ipst = ire->ire_ipst;
607 
608 	ASSERT(ire->ire_ipversion == IPV6_VERSION);
609 	ASSERT(ire->ire_mp == NULL); /* Calls should go through ire_add */
610 	ASSERT(ire->ire_nce == NULL);
611 
612 	/* Find the appropriate list head. */
613 	switch (ire->ire_type) {
614 	case IRE_HOST:
615 		ire->ire_mask_v6 = ipv6_all_ones;
616 		ire->ire_masklen = IPV6_ABITS;
617 		if ((ire->ire_flags & RTF_SETSRC) == 0)
618 			ire->ire_src_addr_v6 = ipv6_all_zeros;
619 		break;
620 	case IRE_CACHE:
621 	case IRE_LOCAL:
622 	case IRE_LOOPBACK:
623 		ire->ire_mask_v6 = ipv6_all_ones;
624 		ire->ire_masklen = IPV6_ABITS;
625 		break;
626 	case IRE_PREFIX:
627 		if ((ire->ire_flags & RTF_SETSRC) == 0)
628 			ire->ire_src_addr_v6 = ipv6_all_zeros;
629 		break;
630 	case IRE_DEFAULT:
631 		if ((ire->ire_flags & RTF_SETSRC) == 0)
632 			ire->ire_src_addr_v6 = ipv6_all_zeros;
633 		break;
634 	case IRE_IF_RESOLVER:
635 	case IRE_IF_NORESOLVER:
636 		break;
637 	default:
638 		printf("ire_add_v6: ire %p has unrecognized IRE type (%d)\n",
639 		    (void *)ire, ire->ire_type);
640 		ire_delete(ire);
641 		*ire_p = NULL;
642 		return (EINVAL);
643 	}
644 
645 	/* Make sure the address is properly masked. */
646 	V6_MASK_COPY(ire->ire_addr_v6, ire->ire_mask_v6, ire->ire_addr_v6);
647 
648 	if ((ire->ire_type & IRE_CACHETABLE) == 0) {
649 		/* IRE goes into Forward Table */
650 		mask_table_index = ip_mask_to_plen_v6(&ire->ire_mask_v6);
651 		if ((ipst->ips_ip_forwarding_table_v6[mask_table_index]) ==
652 		    NULL) {
653 			irb_t *ptr;
654 			int i;
655 
656 			ptr = (irb_t *)mi_zalloc((
657 			    ipst->ips_ip6_ftable_hash_size * sizeof (irb_t)));
658 			if (ptr == NULL) {
659 				ire_delete(ire);
660 				*ire_p = NULL;
661 				return (ENOMEM);
662 			}
663 			for (i = 0; i < ipst->ips_ip6_ftable_hash_size; i++) {
664 				rw_init(&ptr[i].irb_lock, NULL,
665 				    RW_DEFAULT, NULL);
666 			}
667 			mutex_enter(&ipst->ips_ire_ft_init_lock);
668 			if (ipst->ips_ip_forwarding_table_v6[
669 			    mask_table_index] == NULL) {
670 				ipst->ips_ip_forwarding_table_v6[
671 				    mask_table_index] = ptr;
672 				mutex_exit(&ipst->ips_ire_ft_init_lock);
673 			} else {
674 				/*
675 				 * Some other thread won the race in
676 				 * initializing the forwarding table at the
677 				 * same index.
678 				 */
679 				mutex_exit(&ipst->ips_ire_ft_init_lock);
680 				for (i = 0; i < ipst->ips_ip6_ftable_hash_size;
681 				    i++) {
682 					rw_destroy(&ptr[i].irb_lock);
683 				}
684 				mi_free(ptr);
685 			}
686 		}
687 		irb_ptr = &(ipst->ips_ip_forwarding_table_v6[mask_table_index][
688 		    IRE_ADDR_MASK_HASH_V6(ire->ire_addr_v6, ire->ire_mask_v6,
689 		    ipst->ips_ip6_ftable_hash_size)]);
690 	} else {
691 		irb_ptr = &(ipst->ips_ip_cache_table_v6[IRE_ADDR_HASH_V6(
692 		    ire->ire_addr_v6, ipst->ips_ip6_cache_table_size)]);
693 	}
694 	/*
695 	 * For xresolv interfaces (v6 interfaces with an external
696 	 * address resolver), ip_newroute_v6/ip_newroute_ipif_v6
697 	 * are unable to prevent the deletion of the interface route
698 	 * while adding an IRE_CACHE for an on-link destination
699 	 * in the IRE_IF_RESOLVER case, since the ire has to go to
700 	 * the external resolver and return. We can't do a REFHOLD on the
701 	 * associated interface ire for fear of the message being freed
702 	 * if the external resolver can't resolve the address.
703 	 * Here we look up the interface ire in the forwarding table
704 	 * and make sure that the interface route has not been deleted.
705 	 */
706 	if (ire->ire_type == IRE_CACHE &&
707 	    IN6_IS_ADDR_UNSPECIFIED(&ire->ire_gateway_addr_v6) &&
708 	    (((ill_t *)ire->ire_stq->q_ptr)->ill_net_type == IRE_IF_RESOLVER) &&
709 	    (((ill_t *)ire->ire_stq->q_ptr)->ill_flags & ILLF_XRESOLV)) {
710 
711 		pire = ire_ihandle_lookup_onlink_v6(ire);
712 		if (pire == NULL) {
713 			ire_delete(ire);
714 			*ire_p = NULL;
715 			return (EINVAL);
716 		}
717 		/* Prevent pire from getting deleted */
718 		IRB_REFHOLD(pire->ire_bucket);
719 		/* Has it been removed already? */
720 		if (pire->ire_marks & IRE_MARK_CONDEMNED) {
721 			IRB_REFRELE(pire->ire_bucket);
722 			ire_refrele(pire);
723 			ire_delete(ire);
724 			*ire_p = NULL;
725 			return (EINVAL);
726 		}
727 	}
728 
729 	flags = (MATCH_IRE_MASK | MATCH_IRE_TYPE | MATCH_IRE_GW);
730 	/*
731 	 * For IRE_CACHES, MATCH_IRE_IPIF is not enough to check
732 	 * for duplicates because :
733 	 *
734 	 * 1) ire_ipif->ipif_ill and ire_stq->q_ptr could be
735 	 *    pointing at different ills. A real duplicate is
736 	 *    a match on both ire_ipif and ire_stq.
737 	 *
738 	 * 2) We could have multiple packets trying to create
739 	 *    an IRE_CACHE for the same ill.
740 	 *
741 	 * Moreover, IPIF_NOFAILOVER and IPV6_BOUND_PIF endpoints wants
742 	 * to go out on a particular ill. Rather than looking at the
743 	 * packet, we depend on the above for MATCH_IRE_ILL here.
744 	 *
745 	 * Unlike IPv4, MATCH_IRE_IPIF is needed here as we could have
746 	 * multiple IRE_CACHES for an ill for the same destination
747 	 * with various scoped addresses i.e represented by ipifs.
748 	 *
749 	 * MATCH_IRE_ILL is done implicitly below for IRE_CACHES.
750 	 */
751 	if (ire->ire_ipif != NULL)
752 		flags |= MATCH_IRE_IPIF;
753 	/*
754 	 * If we are creating hidden ires, make sure we search on
755 	 * this ill (MATCH_IRE_ILL) and a hidden ire, while we are
756 	 * searching for duplicates below. Otherwise we could
757 	 * potentially find an IRE on some other interface
758 	 * and it may not be a IRE marked with IRE_MARK_HIDDEN. We
759 	 * shouldn't do this as this will lead to an infinite loop as
760 	 * eventually we need an hidden ire for this packet to go
761 	 * out. MATCH_IRE_ILL is already marked above.
762 	 */
763 	if (ire->ire_marks & IRE_MARK_HIDDEN) {
764 		ASSERT(ire->ire_type == IRE_CACHE);
765 		flags |= MATCH_IRE_MARK_HIDDEN;
766 	}
767 
768 	/*
769 	 * Start the atomic add of the ire. Grab the ill locks,
770 	 * ill_g_usesrc_lock and the bucket lock. Check for condemned.
771 	 * To avoid lock order problems, get the ndp6.ndp_g_lock now itself.
772 	 */
773 	if (ire->ire_type == IRE_CACHE) {
774 		mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
775 		ndp_g_lock_held = B_TRUE;
776 	}
777 
778 	/*
779 	 * If ipif or ill is changing ire_atomic_start() may queue the
780 	 * request and return EINPROGRESS.
781 	 */
782 
783 	error = ire_atomic_start(irb_ptr, ire, q, mp, func);
784 	if (error != 0) {
785 		if (ndp_g_lock_held)
786 			mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
787 		/*
788 		 * We don't know whether it is a valid ipif or not.
789 		 * So, set it to NULL. This assumes that the ire has not added
790 		 * a reference to the ipif.
791 		 */
792 		ire->ire_ipif = NULL;
793 		ire_delete(ire);
794 		if (pire != NULL) {
795 			IRB_REFRELE(pire->ire_bucket);
796 			ire_refrele(pire);
797 		}
798 		*ire_p = NULL;
799 		return (error);
800 	}
801 	/*
802 	 * To avoid creating ires having stale values for the ire_max_frag
803 	 * we get the latest value atomically here. For more details
804 	 * see the block comment in ip_sioctl_mtu and in DL_NOTE_SDU_CHANGE
805 	 * in ip_rput_dlpi_writer
806 	 */
807 	if (ire->ire_max_fragp == NULL) {
808 		if (IN6_IS_ADDR_MULTICAST(&ire->ire_addr_v6))
809 			ire->ire_max_frag = ire->ire_ipif->ipif_mtu;
810 		else
811 			ire->ire_max_frag = pire->ire_max_frag;
812 	} else {
813 		uint_t  max_frag;
814 
815 		max_frag = *ire->ire_max_fragp;
816 		ire->ire_max_fragp = NULL;
817 		ire->ire_max_frag = max_frag;
818 	}
819 
820 	/*
821 	 * Atomically check for duplicate and insert in the table.
822 	 */
823 	for (ire1 = irb_ptr->irb_ire; ire1 != NULL; ire1 = ire1->ire_next) {
824 		if (ire1->ire_marks & IRE_MARK_CONDEMNED)
825 			continue;
826 
827 		if (ire->ire_type == IRE_CACHE) {
828 			/*
829 			 * We do MATCH_IRE_ILL implicitly here for IRE_CACHES.
830 			 * As ire_ipif and ire_stq could point to two
831 			 * different ills, we can't pass just ire_ipif to
832 			 * ire_match_args and get a match on both ills.
833 			 * This is just needed for duplicate checks here and
834 			 * so we don't add an extra argument to
835 			 * ire_match_args for this. Do it locally.
836 			 *
837 			 * NOTE : Currently there is no part of the code
838 			 * that asks for both MATCH_IRE_IPIF and MATCH_IRE_ILL
839 			 * match for IRE_CACHEs. Thus we don't want to
840 			 * extend the arguments to ire_match_args_v6.
841 			 */
842 			if (ire1->ire_stq != ire->ire_stq)
843 				continue;
844 			/*
845 			 * Multiroute IRE_CACHEs for a given destination can
846 			 * have the same ire_ipif, typically if their source
847 			 * address is forced using RTF_SETSRC, and the same
848 			 * send-to queue. We differentiate them using the parent
849 			 * handle.
850 			 */
851 			if ((ire1->ire_flags & RTF_MULTIRT) &&
852 			    (ire->ire_flags & RTF_MULTIRT) &&
853 			    (ire1->ire_phandle != ire->ire_phandle))
854 				continue;
855 		}
856 		if (ire1->ire_zoneid != ire->ire_zoneid)
857 			continue;
858 		if (ire_match_args_v6(ire1, &ire->ire_addr_v6,
859 		    &ire->ire_mask_v6, &ire->ire_gateway_addr_v6,
860 		    ire->ire_type, ire->ire_ipif, ire->ire_zoneid, 0, NULL,
861 		    flags)) {
862 			/*
863 			 * Return the old ire after doing a REFHOLD.
864 			 * As most of the callers continue to use the IRE
865 			 * after adding, we return a held ire. This will
866 			 * avoid a lookup in the caller again. If the callers
867 			 * don't want to use it, they need to do a REFRELE.
868 			 */
869 			ip1dbg(("found dup ire existing %p new %p",
870 			    (void *)ire1, (void *)ire));
871 			IRE_REFHOLD(ire1);
872 			if (ndp_g_lock_held)
873 				mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
874 			ire_atomic_end(irb_ptr, ire);
875 			ire_delete(ire);
876 			if (pire != NULL) {
877 				/*
878 				 * Assert that it is
879 				 * not yet removed from the list.
880 				 */
881 				ASSERT(pire->ire_ptpn != NULL);
882 				IRB_REFRELE(pire->ire_bucket);
883 				ire_refrele(pire);
884 			}
885 			*ire_p = ire1;
886 			return (0);
887 		}
888 	}
889 	if (ire->ire_type == IRE_CACHE) {
890 		in6_addr_t gw_addr_v6;
891 		ill_t	*ill = ire_to_ill(ire);
892 		char	buf[INET6_ADDRSTRLEN];
893 		nce_t	*nce;
894 
895 		/*
896 		 * All IRE_CACHE types must have a nce.  If this is
897 		 * not the case the entry will not be added. We need
898 		 * to make sure that if somebody deletes the nce
899 		 * after we looked up, they will find this ire and
900 		 * delete the ire. To delete this ire one needs the
901 		 * bucket lock which we are still holding here. So,
902 		 * even if the nce gets deleted after we looked up,
903 		 * this ire  will get deleted.
904 		 *
905 		 * NOTE : Don't need the ire_lock for accessing
906 		 * ire_gateway_addr_v6 as it is appearing first
907 		 * time on the list and rts_setgwr_v6 could not
908 		 * be changing this.
909 		 */
910 		gw_addr_v6 = ire->ire_gateway_addr_v6;
911 		if (IN6_IS_ADDR_UNSPECIFIED(&gw_addr_v6)) {
912 			nce = ndp_lookup_v6(ill, &ire->ire_addr_v6, B_TRUE);
913 		} else {
914 			nce = ndp_lookup_v6(ill, &gw_addr_v6, B_TRUE);
915 		}
916 		if (nce == NULL)
917 			goto failed;
918 
919 		/* Pair of refhold, refrele just to get the tracing right */
920 		NCE_REFHOLD_TO_REFHOLD_NOTR(nce);
921 		/*
922 		 * Atomically make sure that new IREs don't point
923 		 * to an NCE that is logically deleted (CONDEMNED).
924 		 * ndp_delete() first marks the NCE CONDEMNED.
925 		 * This ensures that the nce_refcnt won't increase
926 		 * due to new nce_lookups or due to addition of new IREs
927 		 * pointing to this NCE. Then ndp_delete() cleans up
928 		 * existing references. If we don't do it atomically here,
929 		 * ndp_delete() -> nce_ire_delete() will not be able to
930 		 * clean up the IRE list completely, and the nce_refcnt
931 		 * won't go down to zero.
932 		 */
933 		mutex_enter(&nce->nce_lock);
934 		if (ill->ill_flags & ILLF_XRESOLV) {
935 			/*
936 			 * If we used an external resolver, we may not
937 			 * have gone through neighbor discovery to get here.
938 			 * Must update the nce_state before the next check.
939 			 */
940 			if (nce->nce_state == ND_INCOMPLETE)
941 				nce->nce_state = ND_REACHABLE;
942 		}
943 		if (nce->nce_state == ND_INCOMPLETE ||
944 		    (nce->nce_flags & NCE_F_CONDEMNED) ||
945 		    (nce->nce_state == ND_UNREACHABLE)) {
946 failed:
947 			if (ndp_g_lock_held)
948 				mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
949 			if (nce != NULL)
950 				mutex_exit(&nce->nce_lock);
951 			ire_atomic_end(irb_ptr, ire);
952 			ip1dbg(("ire_add_v6: No nce for dst %s \n",
953 			    inet_ntop(AF_INET6, &ire->ire_addr_v6,
954 			    buf, sizeof (buf))));
955 			ire_delete(ire);
956 			if (pire != NULL) {
957 				/*
958 				 * Assert that it is
959 				 * not yet removed from the list.
960 				 */
961 				ASSERT(pire->ire_ptpn != NULL);
962 				IRB_REFRELE(pire->ire_bucket);
963 				ire_refrele(pire);
964 			}
965 			if (nce != NULL)
966 				NCE_REFRELE_NOTR(nce);
967 			*ire_p = NULL;
968 			return (EINVAL);
969 		} else {
970 			ire->ire_nce = nce;
971 		}
972 		mutex_exit(&nce->nce_lock);
973 	}
974 	/*
975 	 * Find the first entry that matches ire_addr - provides
976 	 * tail insertion. *irep will be null if no match.
977 	 */
978 	irep = (ire_t **)irb_ptr;
979 	while ((ire1 = *irep) != NULL &&
980 	    !IN6_ARE_ADDR_EQUAL(&ire->ire_addr_v6, &ire1->ire_addr_v6))
981 		irep = &ire1->ire_next;
982 	ASSERT(!(ire->ire_type & IRE_BROADCAST));
983 
984 	if (*irep != NULL) {
985 		/*
986 		 * Find the last ire which matches ire_addr_v6.
987 		 * Needed to do tail insertion among entries with the same
988 		 * ire_addr_v6.
989 		 */
990 		while (IN6_ARE_ADDR_EQUAL(&ire->ire_addr_v6,
991 		    &ire1->ire_addr_v6)) {
992 			irep = &ire1->ire_next;
993 			ire1 = *irep;
994 			if (ire1 == NULL)
995 				break;
996 		}
997 	}
998 
999 	if (ire->ire_type == IRE_DEFAULT) {
1000 		/*
1001 		 * We keep a count of default gateways which is used when
1002 		 * assigning them as routes.
1003 		 */
1004 		ipst->ips_ipv6_ire_default_count++;
1005 		ASSERT(ipst->ips_ipv6_ire_default_count != 0); /* Wraparound */
1006 	}
1007 	/* Insert at *irep */
1008 	ire1 = *irep;
1009 	if (ire1 != NULL)
1010 		ire1->ire_ptpn = &ire->ire_next;
1011 	ire->ire_next = ire1;
1012 	/* Link the new one in. */
1013 	ire->ire_ptpn = irep;
1014 	/*
1015 	 * ire_walk routines de-reference ire_next without holding
1016 	 * a lock. Before we point to the new ire, we want to make
1017 	 * sure the store that sets the ire_next of the new ire
1018 	 * reaches global visibility, so that ire_walk routines
1019 	 * don't see a truncated list of ires i.e if the ire_next
1020 	 * of the new ire gets set after we do "*irep = ire" due
1021 	 * to re-ordering, the ire_walk thread will see a NULL
1022 	 * once it accesses the ire_next of the new ire.
1023 	 * membar_producer() makes sure that the following store
1024 	 * happens *after* all of the above stores.
1025 	 */
1026 	membar_producer();
1027 	*irep = ire;
1028 	ire->ire_bucket = irb_ptr;
1029 	/*
1030 	 * We return a bumped up IRE above. Keep it symmetrical
1031 	 * so that the callers will always have to release. This
1032 	 * helps the callers of this function because they continue
1033 	 * to use the IRE after adding and hence they don't have to
1034 	 * lookup again after we return the IRE.
1035 	 *
1036 	 * NOTE : We don't have to use atomics as this is appearing
1037 	 * in the list for the first time and no one else can bump
1038 	 * up the reference count on this yet.
1039 	 */
1040 	IRE_REFHOLD_LOCKED(ire);
1041 	BUMP_IRE_STATS(ipst->ips_ire_stats_v6, ire_stats_inserted);
1042 	irb_ptr->irb_ire_cnt++;
1043 	if (ire->ire_marks & IRE_MARK_TEMPORARY)
1044 		irb_ptr->irb_tmp_ire_cnt++;
1045 
1046 	if (ire->ire_ipif != NULL) {
1047 		ire->ire_ipif->ipif_ire_cnt++;
1048 		if (ire->ire_stq != NULL) {
1049 			stq_ill = (ill_t *)ire->ire_stq->q_ptr;
1050 			stq_ill->ill_ire_cnt++;
1051 		}
1052 	} else {
1053 		ASSERT(ire->ire_stq == NULL);
1054 	}
1055 
1056 	if (ndp_g_lock_held)
1057 		mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
1058 	ire_atomic_end(irb_ptr, ire);
1059 
1060 	if (pire != NULL) {
1061 		/* Assert that it is not removed from the list yet */
1062 		ASSERT(pire->ire_ptpn != NULL);
1063 		IRB_REFRELE(pire->ire_bucket);
1064 		ire_refrele(pire);
1065 	}
1066 
1067 	if (ire->ire_type != IRE_CACHE) {
1068 		/*
1069 		 * For IREs with a host mask, see if there is an entry
1070 		 * in the cache. If there is one, flush the whole cache as
1071 		 * there might be multiple entries due to RTF_MULTIRT (CGTP).
1072 		 * If no entry is found then there is no need to flush the
1073 		 * cache.
1074 		 */
1075 
1076 		if (ip_mask_to_plen_v6(&ire->ire_mask_v6) == IPV6_ABITS) {
1077 			ire_t *lire;
1078 			lire = ire_ctable_lookup_v6(&ire->ire_addr_v6, NULL,
1079 			    IRE_CACHE, NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE,
1080 			    ipst);
1081 			if (lire != NULL) {
1082 				ire_refrele(lire);
1083 				ire_flush_cache_v6(ire, IRE_FLUSH_ADD);
1084 			}
1085 		} else {
1086 			ire_flush_cache_v6(ire, IRE_FLUSH_ADD);
1087 		}
1088 	}
1089 
1090 	*ire_p = ire;
1091 	return (0);
1092 }
1093 
1094 /*
1095  * Search for all HOST REDIRECT routes that are
1096  * pointing at the specified gateway and
1097  * delete them. This routine is called only
1098  * when a default gateway is going away.
1099  */
1100 static void
1101 ire_delete_host_redirects_v6(const in6_addr_t *gateway, ip_stack_t *ipst)
1102 {
1103 	irb_t *irb_ptr;
1104 	irb_t *irb;
1105 	ire_t *ire;
1106 	in6_addr_t gw_addr_v6;
1107 	int i;
1108 
1109 	/* get the hash table for HOST routes */
1110 	irb_ptr = ipst->ips_ip_forwarding_table_v6[(IP6_MASK_TABLE_SIZE - 1)];
1111 	if (irb_ptr == NULL)
1112 		return;
1113 	for (i = 0; (i < ipst->ips_ip6_ftable_hash_size); i++) {
1114 		irb = &irb_ptr[i];
1115 		IRB_REFHOLD(irb);
1116 		for (ire = irb->irb_ire; ire != NULL; ire = ire->ire_next) {
1117 			if (!(ire->ire_flags & RTF_DYNAMIC))
1118 				continue;
1119 			mutex_enter(&ire->ire_lock);
1120 			gw_addr_v6 = ire->ire_gateway_addr_v6;
1121 			mutex_exit(&ire->ire_lock);
1122 			if (IN6_ARE_ADDR_EQUAL(&gw_addr_v6, gateway))
1123 				ire_delete(ire);
1124 		}
1125 		IRB_REFRELE(irb);
1126 	}
1127 }
1128 
1129 /*
1130  * Delete all the cache entries with this 'addr'. This is the IPv6 counterpart
1131  * of ip_ire_clookup_and_delete. The difference being this function does not
1132  * return any value. IPv6 processing of a gratuitous ARP, as it stands, is
1133  * different than IPv4 in that, regardless of the presence of a cache entry
1134  * for this address, an ire_walk_v6 is done. Another difference is that unlike
1135  * in the case of IPv4 this does not take an ipif_t argument, since it is only
1136  * called by ip_arp_news and the match is always only on the address.
1137  */
1138 void
1139 ip_ire_clookup_and_delete_v6(const in6_addr_t *addr, ip_stack_t *ipst)
1140 {
1141 	irb_t		*irb;
1142 	ire_t		*cire;
1143 	boolean_t	found = B_FALSE;
1144 
1145 	irb = &ipst->ips_ip_cache_table_v6[IRE_ADDR_HASH_V6(*addr,
1146 					    ipst->ips_ip6_cache_table_size)];
1147 	IRB_REFHOLD(irb);
1148 	for (cire = irb->irb_ire; cire != NULL; cire = cire->ire_next) {
1149 		if (cire->ire_marks & IRE_MARK_CONDEMNED)
1150 			continue;
1151 		if (IN6_ARE_ADDR_EQUAL(&cire->ire_addr_v6, addr)) {
1152 
1153 			/* This signifies start of a match */
1154 			if (!found)
1155 				found = B_TRUE;
1156 			if (cire->ire_type == IRE_CACHE) {
1157 				if (cire->ire_nce != NULL)
1158 					ndp_delete(cire->ire_nce);
1159 				ire_delete_v6(cire);
1160 			}
1161 		/* End of the match */
1162 		} else if (found)
1163 			break;
1164 	}
1165 	IRB_REFRELE(irb);
1166 }
1167 
1168 /*
1169  * Delete the specified IRE.
1170  * All calls should use ire_delete().
1171  * Sometimes called as writer though not required by this function.
1172  *
1173  * NOTE : This function is called only if the ire was added
1174  * in the list.
1175  */
1176 void
1177 ire_delete_v6(ire_t *ire)
1178 {
1179 	in6_addr_t gw_addr_v6;
1180 	ip_stack_t	*ipst = ire->ire_ipst;
1181 
1182 	ASSERT(ire->ire_refcnt >= 1);
1183 	ASSERT(ire->ire_ipversion == IPV6_VERSION);
1184 
1185 	if (ire->ire_type != IRE_CACHE)
1186 		ire_flush_cache_v6(ire, IRE_FLUSH_DELETE);
1187 	if (ire->ire_type == IRE_DEFAULT) {
1188 		/*
1189 		 * when a default gateway is going away
1190 		 * delete all the host redirects pointing at that
1191 		 * gateway.
1192 		 */
1193 		mutex_enter(&ire->ire_lock);
1194 		gw_addr_v6 = ire->ire_gateway_addr_v6;
1195 		mutex_exit(&ire->ire_lock);
1196 		ire_delete_host_redirects_v6(&gw_addr_v6, ipst);
1197 	}
1198 }
1199 
1200 /*
1201  * ire_walk routine to delete all IRE_CACHE and IRE_HOST type redirect
1202  * entries.
1203  */
1204 /*ARGSUSED1*/
1205 void
1206 ire_delete_cache_v6(ire_t *ire, char *arg)
1207 {
1208 	char    addrstr1[INET6_ADDRSTRLEN];
1209 	char    addrstr2[INET6_ADDRSTRLEN];
1210 
1211 	if ((ire->ire_type & IRE_CACHE) ||
1212 	    (ire->ire_flags & RTF_DYNAMIC)) {
1213 		ip1dbg(("ire_delete_cache_v6: deleted %s type %d through %s\n",
1214 		    inet_ntop(AF_INET6, &ire->ire_addr_v6,
1215 			addrstr1, sizeof (addrstr1)),
1216 		    ire->ire_type,
1217 		    inet_ntop(AF_INET6, &ire->ire_gateway_addr_v6,
1218 			addrstr2, sizeof (addrstr2))));
1219 		ire_delete(ire);
1220 	}
1221 
1222 }
1223 
1224 /*
1225  * ire_walk routine to delete all IRE_CACHE/IRE_HOST type redirect entries
1226  * that have a given gateway address.
1227  */
1228 void
1229 ire_delete_cache_gw_v6(ire_t *ire, char *addr)
1230 {
1231 	in6_addr_t	*gw_addr = (in6_addr_t *)addr;
1232 	char		buf1[INET6_ADDRSTRLEN];
1233 	char		buf2[INET6_ADDRSTRLEN];
1234 	in6_addr_t	ire_gw_addr_v6;
1235 
1236 	if (!(ire->ire_type & IRE_CACHE) &&
1237 	    !(ire->ire_flags & RTF_DYNAMIC))
1238 		return;
1239 
1240 	mutex_enter(&ire->ire_lock);
1241 	ire_gw_addr_v6 = ire->ire_gateway_addr_v6;
1242 	mutex_exit(&ire->ire_lock);
1243 
1244 	if (IN6_ARE_ADDR_EQUAL(&ire_gw_addr_v6, gw_addr)) {
1245 		ip1dbg(("ire_delete_cache_gw_v6: deleted %s type %d to %s\n",
1246 		    inet_ntop(AF_INET6, &ire->ire_src_addr_v6,
1247 		    buf1, sizeof (buf1)),
1248 		    ire->ire_type,
1249 		    inet_ntop(AF_INET6, &ire_gw_addr_v6,
1250 		    buf2, sizeof (buf2))));
1251 		ire_delete(ire);
1252 	}
1253 }
1254 
1255 /*
1256  * Remove all IRE_CACHE entries that match
1257  * the ire specified.  (Sometimes called
1258  * as writer though not required by this function.)
1259  *
1260  * The flag argument indicates if the
1261  * flush request is due to addition
1262  * of new route (IRE_FLUSH_ADD) or deletion of old
1263  * route (IRE_FLUSH_DELETE).
1264  *
1265  * This routine takes only the IREs from the forwarding
1266  * table and flushes the corresponding entries from
1267  * the cache table.
1268  *
1269  * When flushing due to the deletion of an old route, it
1270  * just checks the cache handles (ire_phandle and ire_ihandle) and
1271  * deletes the ones that match.
1272  *
1273  * When flushing due to the creation of a new route, it checks
1274  * if a cache entry's address matches the one in the IRE and
1275  * that the cache entry's parent has a less specific mask than the
1276  * one in IRE. The destination of such a cache entry could be the
1277  * gateway for other cache entries, so we need to flush those as
1278  * well by looking for gateway addresses matching the IRE's address.
1279  */
1280 void
1281 ire_flush_cache_v6(ire_t *ire, int flag)
1282 {
1283 	int i;
1284 	ire_t *cire;
1285 	irb_t *irb;
1286 	ip_stack_t	*ipst = ire->ire_ipst;
1287 
1288 	if (ire->ire_type & IRE_CACHE)
1289 	    return;
1290 
1291 	/*
1292 	 * If a default is just created, there is no point
1293 	 * in going through the cache, as there will not be any
1294 	 * cached ires.
1295 	 */
1296 	if (ire->ire_type == IRE_DEFAULT && flag == IRE_FLUSH_ADD)
1297 		return;
1298 	if (flag == IRE_FLUSH_ADD) {
1299 		/*
1300 		 * This selective flush is
1301 		 * due to the addition of
1302 		 * new IRE.
1303 		 */
1304 		for (i = 0; i < ipst->ips_ip6_cache_table_size; i++) {
1305 			irb = &ipst->ips_ip_cache_table_v6[i];
1306 			if ((cire = irb->irb_ire) == NULL)
1307 				continue;
1308 			IRB_REFHOLD(irb);
1309 			for (cire = irb->irb_ire; cire != NULL;
1310 			    cire = cire->ire_next) {
1311 				if (cire->ire_type != IRE_CACHE)
1312 					continue;
1313 				/*
1314 				 * If 'cire' belongs to the same subnet
1315 				 * as the new ire being added, and 'cire'
1316 				 * is derived from a prefix that is less
1317 				 * specific than the new ire being added,
1318 				 * we need to flush 'cire'; for instance,
1319 				 * when a new interface comes up.
1320 				 */
1321 				if ((V6_MASK_EQ_2(cire->ire_addr_v6,
1322 				    ire->ire_mask_v6, ire->ire_addr_v6) &&
1323 				    (ip_mask_to_plen_v6(&cire->ire_cmask_v6) <=
1324 				    ire->ire_masklen))) {
1325 					ire_delete(cire);
1326 					continue;
1327 				}
1328 				/*
1329 				 * This is the case when the ire_gateway_addr
1330 				 * of 'cire' belongs to the same subnet as
1331 				 * the new ire being added.
1332 				 * Flushing such ires is sometimes required to
1333 				 * avoid misrouting: say we have a machine with
1334 				 * two interfaces (I1 and I2), a default router
1335 				 * R on the I1 subnet, and a host route to an
1336 				 * off-link destination D with a gateway G on
1337 				 * the I2 subnet.
1338 				 * Under normal operation, we will have an
1339 				 * on-link cache entry for G and an off-link
1340 				 * cache entry for D with G as ire_gateway_addr,
1341 				 * traffic to D will reach its destination
1342 				 * through gateway G.
1343 				 * If the administrator does 'ifconfig I2 down',
1344 				 * the cache entries for D and G will be
1345 				 * flushed. However, G will now be resolved as
1346 				 * an off-link destination using R (the default
1347 				 * router) as gateway. Then D will also be
1348 				 * resolved as an off-link destination using G
1349 				 * as gateway - this behavior is due to
1350 				 * compatibility reasons, see comment in
1351 				 * ire_ihandle_lookup_offlink(). Traffic to D
1352 				 * will go to the router R and probably won't
1353 				 * reach the destination.
1354 				 * The administrator then does 'ifconfig I2 up'.
1355 				 * Since G is on the I2 subnet, this routine
1356 				 * will flush its cache entry. It must also
1357 				 * flush the cache entry for D, otherwise
1358 				 * traffic will stay misrouted until the IRE
1359 				 * times out.
1360 				 */
1361 				if (V6_MASK_EQ_2(cire->ire_gateway_addr_v6,
1362 				    ire->ire_mask_v6, ire->ire_addr_v6)) {
1363 					ire_delete(cire);
1364 					continue;
1365 				}
1366 			}
1367 			IRB_REFRELE(irb);
1368 		}
1369 	} else {
1370 		/*
1371 		 * delete the cache entries based on
1372 		 * handle in the IRE as this IRE is
1373 		 * being deleted/changed.
1374 		 */
1375 		for (i = 0; i < ipst->ips_ip6_cache_table_size; i++) {
1376 			irb = &ipst->ips_ip_cache_table_v6[i];
1377 			if ((cire = irb->irb_ire) == NULL)
1378 				continue;
1379 			IRB_REFHOLD(irb);
1380 			for (cire = irb->irb_ire; cire != NULL;
1381 			    cire = cire->ire_next) {
1382 				if (cire->ire_type != IRE_CACHE)
1383 					continue;
1384 				if ((cire->ire_phandle == 0 ||
1385 				    cire->ire_phandle != ire->ire_phandle) &&
1386 				    (cire->ire_ihandle == 0 ||
1387 				    cire->ire_ihandle != ire->ire_ihandle))
1388 					continue;
1389 				ire_delete(cire);
1390 			}
1391 			IRB_REFRELE(irb);
1392 		}
1393 	}
1394 }
1395 
1396 /*
1397  * Matches the arguments passed with the values in the ire.
1398  *
1399  * Note: for match types that match using "ipif" passed in, ipif
1400  * must be checked for non-NULL before calling this routine.
1401  */
1402 static boolean_t
1403 ire_match_args_v6(ire_t *ire, const in6_addr_t *addr, const in6_addr_t *mask,
1404     const in6_addr_t *gateway, int type, const ipif_t *ipif, zoneid_t zoneid,
1405     uint32_t ihandle, const ts_label_t *tsl, int match_flags)
1406 {
1407 	in6_addr_t masked_addr;
1408 	in6_addr_t gw_addr_v6;
1409 	ill_t *ire_ill = NULL, *dst_ill;
1410 	ill_t *ipif_ill = NULL;
1411 	ill_group_t *ire_ill_group = NULL;
1412 	ill_group_t *ipif_ill_group = NULL;
1413 	ipif_t	*src_ipif;
1414 
1415 	ASSERT(ire->ire_ipversion == IPV6_VERSION);
1416 	ASSERT(addr != NULL);
1417 	ASSERT(mask != NULL);
1418 	ASSERT((!(match_flags & MATCH_IRE_GW)) || gateway != NULL);
1419 	ASSERT((!(match_flags & (MATCH_IRE_ILL|MATCH_IRE_ILL_GROUP))) ||
1420 	    (ipif != NULL && ipif->ipif_isv6));
1421 	ASSERT(!(match_flags & MATCH_IRE_WQ));
1422 
1423 	/*
1424 	 * HIDDEN cache entries have to be looked up specifically with
1425 	 * MATCH_IRE_MARK_HIDDEN. MATCH_IRE_MARK_HIDDEN is usually set
1426 	 * when the interface is FAILED or INACTIVE. In that case,
1427 	 * any IRE_CACHES that exists should be marked with
1428 	 * IRE_MARK_HIDDEN. So, we don't really need to match below
1429 	 * for IRE_MARK_HIDDEN. But we do so for consistency.
1430 	 */
1431 	if (!(match_flags & MATCH_IRE_MARK_HIDDEN) &&
1432 	    (ire->ire_marks & IRE_MARK_HIDDEN))
1433 		return (B_FALSE);
1434 
1435 	if (zoneid != ALL_ZONES && zoneid != ire->ire_zoneid &&
1436 	    ire->ire_zoneid != ALL_ZONES) {
1437 		/*
1438 		 * If MATCH_IRE_ZONEONLY has been set and the supplied zoneid is
1439 		 * valid and does not match that of ire_zoneid, a failure to
1440 		 * match is reported at this point. Otherwise, since some IREs
1441 		 * that are available in the global zone can be used in local
1442 		 * zones, additional checks need to be performed:
1443 		 *
1444 		 *	IRE_CACHE and IRE_LOOPBACK entries should
1445 		 *	never be matched in this situation.
1446 		 *
1447 		 *	IRE entries that have an interface associated with them
1448 		 *	should in general not match unless they are an IRE_LOCAL
1449 		 *	or in the case when MATCH_IRE_DEFAULT has been set in
1450 		 *	the caller.  In the case of the former, checking of the
1451 		 *	other fields supplied should take place.
1452 		 *
1453 		 *	In the case where MATCH_IRE_DEFAULT has been set,
1454 		 *	all of the ipif's associated with the IRE's ill are
1455 		 *	checked to see if there is a matching zoneid.  If any
1456 		 *	one ipif has a matching zoneid, this IRE is a
1457 		 *	potential candidate so checking of the other fields
1458 		 *	takes place.
1459 		 *
1460 		 *	In the case where the IRE_INTERFACE has a usable source
1461 		 *	address (indicated by ill_usesrc_ifindex) in the
1462 		 *	correct zone then it's permitted to return this IRE
1463 		 */
1464 		if (match_flags & MATCH_IRE_ZONEONLY)
1465 			return (B_FALSE);
1466 		if (ire->ire_type & (IRE_CACHE | IRE_LOOPBACK))
1467 			return (B_FALSE);
1468 		/*
1469 		 * Note, IRE_INTERFACE can have the stq as NULL. For
1470 		 * example, if the default multicast route is tied to
1471 		 * the loopback address.
1472 		 */
1473 		if ((ire->ire_type & IRE_INTERFACE) &&
1474 		    (ire->ire_stq != NULL)) {
1475 			dst_ill = (ill_t *)ire->ire_stq->q_ptr;
1476 			/*
1477 			 * If there is a usable source address in the
1478 			 * zone, then it's ok to return an
1479 			 * IRE_INTERFACE
1480 			 */
1481 			if ((dst_ill->ill_usesrc_ifindex != 0) &&
1482 			    (src_ipif = ipif_select_source_v6(dst_ill, addr,
1483 			    RESTRICT_TO_NONE, IPV6_PREFER_SRC_DEFAULT, zoneid))
1484 			    != NULL) {
1485 				ip3dbg(("ire_match_args: src_ipif %p"
1486 				    " dst_ill %p", (void *)src_ipif,
1487 				    (void *)dst_ill));
1488 				ipif_refrele(src_ipif);
1489 			} else {
1490 				ip3dbg(("ire_match_args: src_ipif NULL"
1491 				    " dst_ill %p\n", (void *)dst_ill));
1492 				return (B_FALSE);
1493 			}
1494 		}
1495 		if (ire->ire_ipif != NULL && ire->ire_type != IRE_LOCAL &&
1496 		    !(ire->ire_type & IRE_INTERFACE)) {
1497 			ipif_t	*tipif;
1498 
1499 			if ((match_flags & MATCH_IRE_DEFAULT) == 0)
1500 				return (B_FALSE);
1501 			mutex_enter(&ire->ire_ipif->ipif_ill->ill_lock);
1502 			for (tipif = ire->ire_ipif->ipif_ill->ill_ipif;
1503 			    tipif != NULL; tipif = tipif->ipif_next) {
1504 				if (IPIF_CAN_LOOKUP(tipif) &&
1505 				    (tipif->ipif_flags & IPIF_UP) &&
1506 				    (tipif->ipif_zoneid == zoneid ||
1507 				    tipif->ipif_zoneid == ALL_ZONES))
1508 					break;
1509 			}
1510 			mutex_exit(&ire->ire_ipif->ipif_ill->ill_lock);
1511 			if (tipif == NULL)
1512 				return (B_FALSE);
1513 		}
1514 	}
1515 
1516 	if (match_flags & MATCH_IRE_GW) {
1517 		mutex_enter(&ire->ire_lock);
1518 		gw_addr_v6 = ire->ire_gateway_addr_v6;
1519 		mutex_exit(&ire->ire_lock);
1520 	}
1521 	/*
1522 	 * For IRE_CACHES, MATCH_IRE_ILL/ILL_GROUP really means that
1523 	 * somebody wants to send out on a particular interface which
1524 	 * is given by ire_stq and hence use ire_stq to derive the ill
1525 	 * value. ire_ipif for IRE_CACHES is just the
1526 	 * means of getting a source address i.e ire_src_addr_v6 =
1527 	 * ire->ire_ipif->ipif_src_addr_v6.
1528 	 */
1529 	if (match_flags & (MATCH_IRE_ILL|MATCH_IRE_ILL_GROUP)) {
1530 		ire_ill = ire_to_ill(ire);
1531 		if (ire_ill != NULL)
1532 			ire_ill_group = ire_ill->ill_group;
1533 		ipif_ill = ipif->ipif_ill;
1534 		ipif_ill_group = ipif_ill->ill_group;
1535 	}
1536 
1537 	/* No ire_addr_v6 bits set past the mask */
1538 	ASSERT(V6_MASK_EQ(ire->ire_addr_v6, ire->ire_mask_v6,
1539 	    ire->ire_addr_v6));
1540 	V6_MASK_COPY(*addr, *mask, masked_addr);
1541 
1542 	if (V6_MASK_EQ(*addr, *mask, ire->ire_addr_v6) &&
1543 	    ((!(match_flags & MATCH_IRE_GW)) ||
1544 		IN6_ARE_ADDR_EQUAL(&gw_addr_v6, gateway)) &&
1545 	    ((!(match_flags & MATCH_IRE_TYPE)) ||
1546 		(ire->ire_type & type)) &&
1547 	    ((!(match_flags & MATCH_IRE_SRC)) ||
1548 		IN6_ARE_ADDR_EQUAL(&ire->ire_src_addr_v6,
1549 		&ipif->ipif_v6src_addr)) &&
1550 	    ((!(match_flags & MATCH_IRE_IPIF)) ||
1551 		(ire->ire_ipif == ipif)) &&
1552 	    ((!(match_flags & MATCH_IRE_MARK_HIDDEN)) ||
1553 		(ire->ire_type != IRE_CACHE ||
1554 		ire->ire_marks & IRE_MARK_HIDDEN)) &&
1555 	    ((!(match_flags & MATCH_IRE_ILL)) ||
1556 		(ire_ill == ipif_ill)) &&
1557 	    ((!(match_flags & MATCH_IRE_IHANDLE)) ||
1558 		(ire->ire_ihandle == ihandle)) &&
1559 	    ((!(match_flags & MATCH_IRE_ILL_GROUP)) ||
1560 		(ire_ill == ipif_ill) ||
1561 		(ire_ill_group != NULL &&
1562 		ire_ill_group == ipif_ill_group)) &&
1563 	    ((!(match_flags & MATCH_IRE_SECATTR)) ||
1564 		(!is_system_labeled()) ||
1565 		(tsol_ire_match_gwattr(ire, tsl) == 0))) {
1566 		/* We found the matched IRE */
1567 		return (B_TRUE);
1568 	}
1569 	return (B_FALSE);
1570 }
1571 
1572 /*
1573  * Lookup for a route in all the tables
1574  */
1575 ire_t *
1576 ire_route_lookup_v6(const in6_addr_t *addr, const in6_addr_t *mask,
1577     const in6_addr_t *gateway, int type, const ipif_t *ipif, ire_t **pire,
1578     zoneid_t zoneid, const ts_label_t *tsl, int flags, ip_stack_t *ipst)
1579 {
1580 	ire_t *ire = NULL;
1581 
1582 	/*
1583 	 * ire_match_args_v6() will dereference ipif MATCH_IRE_SRC or
1584 	 * MATCH_IRE_ILL is set.
1585 	 */
1586 	if ((flags & (MATCH_IRE_SRC | MATCH_IRE_ILL | MATCH_IRE_ILL_GROUP)) &&
1587 	    (ipif == NULL))
1588 		return (NULL);
1589 
1590 	/*
1591 	 * might be asking for a cache lookup,
1592 	 * This is not best way to lookup cache,
1593 	 * user should call ire_cache_lookup directly.
1594 	 *
1595 	 * If MATCH_IRE_TYPE was set, first lookup in the cache table and then
1596 	 * in the forwarding table, if the applicable type flags were set.
1597 	 */
1598 	if ((flags & MATCH_IRE_TYPE) == 0 || (type & IRE_CACHETABLE) != 0) {
1599 		ire = ire_ctable_lookup_v6(addr, gateway, type, ipif, zoneid,
1600 		    tsl, flags, ipst);
1601 		if (ire != NULL)
1602 			return (ire);
1603 	}
1604 	if ((flags & MATCH_IRE_TYPE) == 0 || (type & IRE_FORWARDTABLE) != 0) {
1605 		ire = ire_ftable_lookup_v6(addr, mask, gateway, type, ipif,
1606 		    pire, zoneid, 0, tsl, flags, ipst);
1607 	}
1608 	return (ire);
1609 }
1610 
1611 /*
1612  * Lookup a route in forwarding table.
1613  * specific lookup is indicated by passing the
1614  * required parameters and indicating the
1615  * match required in flag field.
1616  *
1617  * Looking for default route can be done in three ways
1618  * 1) pass mask as ipv6_all_zeros and set MATCH_IRE_MASK in flags field
1619  *    along with other matches.
1620  * 2) pass type as IRE_DEFAULT and set MATCH_IRE_TYPE in flags
1621  *    field along with other matches.
1622  * 3) if the destination and mask are passed as zeros.
1623  *
1624  * A request to return a default route if no route
1625  * is found, can be specified by setting MATCH_IRE_DEFAULT
1626  * in flags.
1627  *
1628  * It does not support recursion more than one level. It
1629  * will do recursive lookup only when the lookup maps to
1630  * a prefix or default route and MATCH_IRE_RECURSIVE flag is passed.
1631  *
1632  * If the routing table is setup to allow more than one level
1633  * of recursion, the cleaning up cache table will not work resulting
1634  * in invalid routing.
1635  *
1636  * Supports link-local addresses by following the ipif/ill when recursing.
1637  *
1638  * NOTE : When this function returns NULL, pire has already been released.
1639  *	  pire is valid only when this function successfully returns an
1640  *	  ire.
1641  */
1642 ire_t *
1643 ire_ftable_lookup_v6(const in6_addr_t *addr, const in6_addr_t *mask,
1644     const in6_addr_t *gateway, int type, const ipif_t *ipif, ire_t **pire,
1645     zoneid_t zoneid, uint32_t ihandle, const ts_label_t *tsl, int flags,
1646     ip_stack_t *ipst)
1647 {
1648 	irb_t *irb_ptr;
1649 	ire_t	*rire;
1650 	ire_t *ire = NULL;
1651 	ire_t	*saved_ire;
1652 	nce_t	*nce;
1653 	int i;
1654 	in6_addr_t gw_addr_v6;
1655 
1656 	ASSERT(addr != NULL);
1657 	ASSERT((!(flags & MATCH_IRE_MASK)) || mask != NULL);
1658 	ASSERT((!(flags & MATCH_IRE_GW)) || gateway != NULL);
1659 	ASSERT(ipif == NULL || ipif->ipif_isv6);
1660 	ASSERT(!(flags & MATCH_IRE_WQ));
1661 
1662 	/*
1663 	 * When we return NULL from this function, we should make
1664 	 * sure that *pire is NULL so that the callers will not
1665 	 * wrongly REFRELE the pire.
1666 	 */
1667 	if (pire != NULL)
1668 		*pire = NULL;
1669 	/*
1670 	 * ire_match_args_v6() will dereference ipif if MATCH_IRE_SRC,
1671 	 * MATCH_IRE_ILL or MATCH_IRE_ILL_GROUP is set.
1672 	 */
1673 	if ((flags & (MATCH_IRE_SRC | MATCH_IRE_ILL | MATCH_IRE_ILL_GROUP)) &&
1674 	    (ipif == NULL))
1675 		return (NULL);
1676 
1677 	/*
1678 	 * If the mask is known, the lookup
1679 	 * is simple, if the mask is not known
1680 	 * we need to search.
1681 	 */
1682 	if (flags & MATCH_IRE_MASK) {
1683 		uint_t masklen;
1684 
1685 		masklen = ip_mask_to_plen_v6(mask);
1686 		if (ipst->ips_ip_forwarding_table_v6[masklen] == NULL)
1687 			return (NULL);
1688 		irb_ptr = &(ipst->ips_ip_forwarding_table_v6[masklen][
1689 		    IRE_ADDR_MASK_HASH_V6(*addr, *mask,
1690 			ipst->ips_ip6_ftable_hash_size)]);
1691 		rw_enter(&irb_ptr->irb_lock, RW_READER);
1692 		for (ire = irb_ptr->irb_ire; ire != NULL;
1693 		    ire = ire->ire_next) {
1694 			if (ire->ire_marks & IRE_MARK_CONDEMNED)
1695 				continue;
1696 			if (ire_match_args_v6(ire, addr, mask, gateway, type,
1697 			    ipif, zoneid, ihandle, tsl, flags))
1698 				goto found_ire;
1699 		}
1700 		rw_exit(&irb_ptr->irb_lock);
1701 	} else {
1702 		/*
1703 		 * In this case we don't know the mask, we need to
1704 		 * search the table assuming different mask sizes.
1705 		 * we start with 128 bit mask, we don't allow default here.
1706 		 */
1707 		for (i = (IP6_MASK_TABLE_SIZE - 1); i > 0; i--) {
1708 			in6_addr_t tmpmask;
1709 
1710 			if ((ipst->ips_ip_forwarding_table_v6[i]) == NULL)
1711 				continue;
1712 			(void) ip_plen_to_mask_v6(i, &tmpmask);
1713 			irb_ptr = &ipst->ips_ip_forwarding_table_v6[i][
1714 			    IRE_ADDR_MASK_HASH_V6(*addr, tmpmask,
1715 			    ipst->ips_ip6_ftable_hash_size)];
1716 			rw_enter(&irb_ptr->irb_lock, RW_READER);
1717 			for (ire = irb_ptr->irb_ire; ire != NULL;
1718 			    ire = ire->ire_next) {
1719 				if (ire->ire_marks & IRE_MARK_CONDEMNED)
1720 					continue;
1721 				if (ire_match_args_v6(ire, addr,
1722 				    &ire->ire_mask_v6, gateway, type, ipif,
1723 				    zoneid, ihandle, tsl, flags))
1724 					goto found_ire;
1725 			}
1726 			rw_exit(&irb_ptr->irb_lock);
1727 		}
1728 	}
1729 
1730 	/*
1731 	 * We come here if no route has yet been found.
1732 	 *
1733 	 * Handle the case where default route is
1734 	 * requested by specifying type as one of the possible
1735 	 * types for that can have a zero mask (IRE_DEFAULT and IRE_INTERFACE).
1736 	 *
1737 	 * If MATCH_IRE_MASK is specified, then the appropriate default route
1738 	 * would have been found above if it exists so it isn't looked up here.
1739 	 * If MATCH_IRE_DEFAULT was also specified, then a default route will be
1740 	 * searched for later.
1741 	 */
1742 	if ((flags & (MATCH_IRE_TYPE | MATCH_IRE_MASK)) == MATCH_IRE_TYPE &&
1743 	    (type & (IRE_DEFAULT | IRE_INTERFACE))) {
1744 		if (ipst->ips_ip_forwarding_table_v6[0] != NULL) {
1745 			/* addr & mask is zero for defaults */
1746 			irb_ptr = &ipst->ips_ip_forwarding_table_v6[0][
1747 			    IRE_ADDR_HASH_V6(ipv6_all_zeros,
1748 			    ipst->ips_ip6_ftable_hash_size)];
1749 			rw_enter(&irb_ptr->irb_lock, RW_READER);
1750 			for (ire = irb_ptr->irb_ire; ire != NULL;
1751 			    ire = ire->ire_next) {
1752 
1753 				if (ire->ire_marks & IRE_MARK_CONDEMNED)
1754 					continue;
1755 
1756 				if (ire_match_args_v6(ire, addr,
1757 				    &ipv6_all_zeros, gateway, type, ipif,
1758 				    zoneid, ihandle, tsl, flags))
1759 					goto found_ire;
1760 			}
1761 			rw_exit(&irb_ptr->irb_lock);
1762 		}
1763 	}
1764 	/*
1765 	 * We come here only if no route is found.
1766 	 * see if the default route can be used which is allowed
1767 	 * only if the default matching criteria is specified.
1768 	 * The ipv6_ire_default_count tracks the number of IRE_DEFAULT
1769 	 * entries. However, the ip_forwarding_table_v6[0] also contains
1770 	 * interface routes thus the count can be zero.
1771 	 */
1772 	saved_ire = NULL;
1773 	if ((flags & (MATCH_IRE_DEFAULT | MATCH_IRE_MASK)) ==
1774 	    MATCH_IRE_DEFAULT) {
1775 		ire_t	*ire_origin;
1776 		uint_t	g_index;
1777 		uint_t	index;
1778 
1779 		if (ipst->ips_ip_forwarding_table_v6[0] == NULL)
1780 			return (NULL);
1781 		irb_ptr = &(ipst->ips_ip_forwarding_table_v6[0])[0];
1782 
1783 		/*
1784 		 * Keep a tab on the bucket while looking the IRE_DEFAULT
1785 		 * entries. We need to keep track of a particular IRE
1786 		 * (ire_origin) so this ensures that it will not be unlinked
1787 		 * from the hash list during the recursive lookup below.
1788 		 */
1789 		IRB_REFHOLD(irb_ptr);
1790 		ire = irb_ptr->irb_ire;
1791 		if (ire == NULL) {
1792 			IRB_REFRELE(irb_ptr);
1793 			return (NULL);
1794 		}
1795 
1796 		/*
1797 		 * Get the index first, since it can be changed by other
1798 		 * threads. Then get to the right default route skipping
1799 		 * default interface routes if any. As we hold a reference on
1800 		 * the IRE bucket, ipv6_ire_default_count can only increase so
1801 		 * we can't reach the end of the hash list unexpectedly.
1802 		 */
1803 		if (ipst->ips_ipv6_ire_default_count != 0) {
1804 			g_index = ipst->ips_ipv6_ire_default_index++;
1805 			index = g_index % ipst->ips_ipv6_ire_default_count;
1806 			while (index != 0) {
1807 				if (!(ire->ire_type & IRE_INTERFACE))
1808 					index--;
1809 				ire = ire->ire_next;
1810 			}
1811 			ASSERT(ire != NULL);
1812 		} else {
1813 			/*
1814 			 * No default route, so we only have default interface
1815 			 * routes: don't enter the first loop.
1816 			 */
1817 			ire = NULL;
1818 		}
1819 
1820 		/*
1821 		 * Round-robin the default routers list looking for a neighbor
1822 		 * that matches the passed in parameters and is reachable.  If
1823 		 * none found, just return a route from the default router list
1824 		 * if it exists. If we can't find a default route (IRE_DEFAULT),
1825 		 * look for interface default routes.
1826 		 * We start with the ire we found above and we walk the hash
1827 		 * list until we're back where we started, see
1828 		 * ire_get_next_default_ire(). It doesn't matter if default
1829 		 * routes are added or deleted by other threads - we know this
1830 		 * ire will stay in the list because we hold a reference on the
1831 		 * ire bucket.
1832 		 * NB: if we only have interface default routes, ire is NULL so
1833 		 * we don't even enter this loop (see above).
1834 		 */
1835 		ire_origin = ire;
1836 		for (; ire != NULL;
1837 		    ire = ire_get_next_default_ire(ire, ire_origin)) {
1838 
1839 			if (ire_match_args_v6(ire, addr,
1840 			    &ipv6_all_zeros, gateway, type, ipif,
1841 			    zoneid, ihandle, tsl, flags)) {
1842 				int match_flags;
1843 
1844 				/*
1845 				 * We have something to work with.
1846 				 * If we can find a resolved/reachable
1847 				 * entry, we will use this. Otherwise
1848 				 * we'll try to find an entry that has
1849 				 * a resolved cache entry. We will fallback
1850 				 * on this if we don't find anything else.
1851 				 */
1852 				if (saved_ire == NULL)
1853 					saved_ire = ire;
1854 				mutex_enter(&ire->ire_lock);
1855 				gw_addr_v6 = ire->ire_gateway_addr_v6;
1856 				mutex_exit(&ire->ire_lock);
1857 				match_flags = MATCH_IRE_ILL_GROUP |
1858 				    MATCH_IRE_SECATTR;
1859 				rire = ire_ctable_lookup_v6(&gw_addr_v6, NULL,
1860 				    0, ire->ire_ipif, zoneid, tsl, match_flags,
1861 				    ipst);
1862 				if (rire != NULL) {
1863 					nce = rire->ire_nce;
1864 					if (nce != NULL &&
1865 					    NCE_ISREACHABLE(nce) &&
1866 					    nce->nce_flags & NCE_F_ISROUTER) {
1867 						ire_refrele(rire);
1868 						IRE_REFHOLD(ire);
1869 						IRB_REFRELE(irb_ptr);
1870 						goto found_ire_held;
1871 					} else if (nce != NULL &&
1872 					    !(nce->nce_flags &
1873 					    NCE_F_ISROUTER)) {
1874 						/*
1875 						 * Make sure we don't use
1876 						 * this ire
1877 						 */
1878 						if (saved_ire == ire)
1879 							saved_ire = NULL;
1880 					}
1881 					ire_refrele(rire);
1882 				} else if (ipst->
1883 				    ips_ipv6_ire_default_count > 1 &&
1884 				    zoneid != GLOBAL_ZONEID) {
1885 					/*
1886 					 * When we're in a local zone, we're
1887 					 * only interested in default routers
1888 					 * that are reachable through ipifs
1889 					 * within our zone.
1890 					 * The potentially expensive call to
1891 					 * ire_route_lookup_v6() is avoided when
1892 					 * we have only one default route.
1893 					 */
1894 					int ire_match_flags = MATCH_IRE_TYPE |
1895 					    MATCH_IRE_SECATTR;
1896 
1897 					if (ire->ire_ipif != NULL) {
1898 						ire_match_flags |=
1899 						    MATCH_IRE_ILL_GROUP;
1900 					}
1901 					rire = ire_route_lookup_v6(&gw_addr_v6,
1902 					    NULL, NULL, IRE_INTERFACE,
1903 					    ire->ire_ipif, NULL,
1904 					    zoneid, tsl, ire_match_flags, ipst);
1905 					if (rire != NULL) {
1906 						ire_refrele(rire);
1907 						saved_ire = ire;
1908 					} else if (saved_ire == ire) {
1909 						/*
1910 						 * Make sure we don't use
1911 						 * this ire
1912 						 */
1913 						saved_ire = NULL;
1914 					}
1915 				}
1916 			}
1917 		}
1918 		if (saved_ire != NULL) {
1919 			ire = saved_ire;
1920 			IRE_REFHOLD(ire);
1921 			IRB_REFRELE(irb_ptr);
1922 			goto found_ire_held;
1923 		} else {
1924 			/*
1925 			 * Look for a interface default route matching the
1926 			 * args passed in. No round robin here. Just pick
1927 			 * the right one.
1928 			 */
1929 			for (ire = irb_ptr->irb_ire; ire != NULL;
1930 			    ire = ire->ire_next) {
1931 
1932 				if (!(ire->ire_type & IRE_INTERFACE))
1933 					continue;
1934 
1935 				if (ire->ire_marks & IRE_MARK_CONDEMNED)
1936 					continue;
1937 
1938 				if (ire_match_args_v6(ire, addr,
1939 				    &ipv6_all_zeros, gateway, type, ipif,
1940 				    zoneid, ihandle, tsl, flags)) {
1941 					IRE_REFHOLD(ire);
1942 					IRB_REFRELE(irb_ptr);
1943 					goto found_ire_held;
1944 				}
1945 			}
1946 			IRB_REFRELE(irb_ptr);
1947 		}
1948 	}
1949 	ASSERT(ire == NULL);
1950 	ip1dbg(("ire_ftable_lookup_v6: returning NULL ire"));
1951 	return (NULL);
1952 found_ire:
1953 	ASSERT((ire->ire_marks & IRE_MARK_CONDEMNED) == 0);
1954 	IRE_REFHOLD(ire);
1955 	rw_exit(&irb_ptr->irb_lock);
1956 
1957 found_ire_held:
1958 	if ((flags & MATCH_IRE_RJ_BHOLE) &&
1959 	    (ire->ire_flags & (RTF_BLACKHOLE | RTF_REJECT))) {
1960 		return (ire);
1961 	}
1962 	/*
1963 	 * At this point, IRE that was found must be an IRE_FORWARDTABLE
1964 	 * or IRE_CACHETABLE type.  If this is a recursive lookup and an
1965 	 * IRE_INTERFACE type was found, return that.  If it was some other
1966 	 * IRE_FORWARDTABLE type of IRE (one of the prefix types), then it
1967 	 * is necessary to fill in the  parent IRE pointed to by pire, and
1968 	 * then lookup the gateway address of  the parent.  For backwards
1969 	 * compatiblity, if this lookup returns an
1970 	 * IRE other than a IRE_CACHETABLE or IRE_INTERFACE, then one more level
1971 	 * of lookup is done.
1972 	 */
1973 	if (flags & MATCH_IRE_RECURSIVE) {
1974 		const ipif_t *gw_ipif;
1975 		int match_flags = MATCH_IRE_DSTONLY;
1976 
1977 		if (ire->ire_type & IRE_INTERFACE)
1978 			return (ire);
1979 		if (pire != NULL)
1980 			*pire = ire;
1981 		/*
1982 		 * If we can't find an IRE_INTERFACE or the caller has not
1983 		 * asked for pire, we need to REFRELE the saved_ire.
1984 		 */
1985 		saved_ire = ire;
1986 
1987 		/*
1988 		 * Currently MATCH_IRE_ILL is never used with
1989 		 * (MATCH_IRE_RECURSIVE | MATCH_IRE_DEFAULT) while
1990 		 * sending out packets as MATCH_IRE_ILL is used only
1991 		 * for communicating with on-link hosts. We can't assert
1992 		 * that here as RTM_GET calls this function with
1993 		 * MATCH_IRE_ILL | MATCH_IRE_DEFAULT | MATCH_IRE_RECURSIVE.
1994 		 * We have already used the MATCH_IRE_ILL in determining
1995 		 * the right prefix route at this point. To match the
1996 		 * behavior of how we locate routes while sending out
1997 		 * packets, we don't want to use MATCH_IRE_ILL below
1998 		 * while locating the interface route.
1999 		 */
2000 		if (ire->ire_ipif != NULL)
2001 			match_flags |= MATCH_IRE_ILL_GROUP;
2002 
2003 		mutex_enter(&ire->ire_lock);
2004 		gw_addr_v6 = ire->ire_gateway_addr_v6;
2005 		mutex_exit(&ire->ire_lock);
2006 
2007 		ire = ire_route_lookup_v6(&gw_addr_v6, NULL, NULL, 0,
2008 		    ire->ire_ipif, NULL, zoneid, tsl, match_flags, ipst);
2009 		if (ire == NULL) {
2010 			/*
2011 			 * In this case we have to deal with the
2012 			 * MATCH_IRE_PARENT flag, which means the
2013 			 * parent has to be returned if ire is NULL.
2014 			 * The aim of this is to have (at least) a starting
2015 			 * ire when we want to look at all of the ires in a
2016 			 * bucket aimed at a single destination (as is the
2017 			 * case in ip_newroute_v6 for the RTF_MULTIRT
2018 			 * flagged routes).
2019 			 */
2020 			if (flags & MATCH_IRE_PARENT) {
2021 				if (pire != NULL) {
2022 					/*
2023 					 * Need an extra REFHOLD, if the
2024 					 * parent ire is returned via both
2025 					 * ire and pire.
2026 					 */
2027 					IRE_REFHOLD(saved_ire);
2028 				}
2029 				ire = saved_ire;
2030 			} else {
2031 				ire_refrele(saved_ire);
2032 				if (pire != NULL)
2033 					*pire = NULL;
2034 			}
2035 			return (ire);
2036 		}
2037 		if (ire->ire_type & (IRE_CACHETABLE | IRE_INTERFACE)) {
2038 			/*
2039 			 * If the caller did not ask for pire, release
2040 			 * it now.
2041 			 */
2042 			if (pire == NULL) {
2043 				ire_refrele(saved_ire);
2044 			}
2045 			return (ire);
2046 		}
2047 		match_flags |= MATCH_IRE_TYPE;
2048 		mutex_enter(&ire->ire_lock);
2049 		gw_addr_v6 = ire->ire_gateway_addr_v6;
2050 		mutex_exit(&ire->ire_lock);
2051 		gw_ipif = ire->ire_ipif;
2052 		ire_refrele(ire);
2053 		ire = ire_route_lookup_v6(&gw_addr_v6, NULL, NULL,
2054 		    (IRE_CACHETABLE | IRE_INTERFACE), gw_ipif, NULL, zoneid,
2055 		    NULL, match_flags, ipst);
2056 		if (ire == NULL) {
2057 			/*
2058 			 * In this case we have to deal with the
2059 			 * MATCH_IRE_PARENT flag, which means the
2060 			 * parent has to be returned if ire is NULL.
2061 			 * The aim of this is to have (at least) a starting
2062 			 * ire when we want to look at all of the ires in a
2063 			 * bucket aimed at a single destination (as is the
2064 			 * case in ip_newroute_v6 for the RTF_MULTIRT
2065 			 * flagged routes).
2066 			 */
2067 			if (flags & MATCH_IRE_PARENT) {
2068 				if (pire != NULL) {
2069 					/*
2070 					 * Need an extra REFHOLD, if the
2071 					 * parent ire is returned via both
2072 					 * ire and pire.
2073 					 */
2074 					IRE_REFHOLD(saved_ire);
2075 				}
2076 				ire = saved_ire;
2077 			} else {
2078 				ire_refrele(saved_ire);
2079 				if (pire != NULL)
2080 					*pire = NULL;
2081 			}
2082 			return (ire);
2083 		} else if (pire == NULL) {
2084 			/*
2085 			 * If the caller did not ask for pire, release
2086 			 * it now.
2087 			 */
2088 			ire_refrele(saved_ire);
2089 		}
2090 		return (ire);
2091 	}
2092 
2093 	ASSERT(pire == NULL || *pire == NULL);
2094 	return (ire);
2095 }
2096 
2097 /*
2098  * Delete the IRE cache for the gateway and all IRE caches whose
2099  * ire_gateway_addr_v6 points to this gateway, and allow them to
2100  * be created on demand by ip_newroute_v6.
2101  */
2102 void
2103 ire_clookup_delete_cache_gw_v6(const in6_addr_t *addr, zoneid_t zoneid,
2104 	ip_stack_t *ipst)
2105 {
2106 	irb_t *irb;
2107 	ire_t *ire;
2108 
2109 	irb = &ipst->ips_ip_cache_table_v6[IRE_ADDR_HASH_V6(*addr,
2110 	    ipst->ips_ip6_cache_table_size)];
2111 	IRB_REFHOLD(irb);
2112 	for (ire = irb->irb_ire; ire != NULL; ire = ire->ire_next) {
2113 		if (ire->ire_marks & IRE_MARK_CONDEMNED)
2114 			continue;
2115 
2116 		ASSERT(IN6_ARE_ADDR_EQUAL(&ire->ire_mask_v6, &ipv6_all_ones));
2117 		if (ire_match_args_v6(ire, addr, &ire->ire_mask_v6, 0,
2118 		    IRE_CACHE, NULL, zoneid, 0, NULL, MATCH_IRE_TYPE)) {
2119 			ire_delete(ire);
2120 		}
2121 	}
2122 	IRB_REFRELE(irb);
2123 
2124 	ire_walk_v6(ire_delete_cache_gw_v6, (char *)addr, zoneid, ipst);
2125 }
2126 
2127 /*
2128  * Looks up cache table for a route.
2129  * specific lookup can be indicated by
2130  * passing the MATCH_* flags and the
2131  * necessary parameters.
2132  */
2133 ire_t *
2134 ire_ctable_lookup_v6(const in6_addr_t *addr, const in6_addr_t *gateway,
2135     int type, const ipif_t *ipif, zoneid_t zoneid, const ts_label_t *tsl,
2136     int flags, ip_stack_t *ipst)
2137 {
2138 	ire_t *ire;
2139 	irb_t *irb_ptr;
2140 	ASSERT(addr != NULL);
2141 	ASSERT((!(flags & MATCH_IRE_GW)) || gateway != NULL);
2142 
2143 	/*
2144 	 * ire_match_args_v6() will dereference ipif MATCH_IRE_SRC or
2145 	 * MATCH_IRE_ILL is set.
2146 	 */
2147 	if ((flags & (MATCH_IRE_SRC |  MATCH_IRE_ILL | MATCH_IRE_ILL_GROUP)) &&
2148 	    (ipif == NULL))
2149 		return (NULL);
2150 
2151 	irb_ptr = &ipst->ips_ip_cache_table_v6[IRE_ADDR_HASH_V6(*addr,
2152 				    ipst->ips_ip6_cache_table_size)];
2153 	rw_enter(&irb_ptr->irb_lock, RW_READER);
2154 	for (ire = irb_ptr->irb_ire; ire; ire = ire->ire_next) {
2155 		if (ire->ire_marks & IRE_MARK_CONDEMNED)
2156 			continue;
2157 
2158 		ASSERT(IN6_ARE_ADDR_EQUAL(&ire->ire_mask_v6, &ipv6_all_ones));
2159 		if (ire_match_args_v6(ire, addr, &ire->ire_mask_v6, gateway,
2160 		    type, ipif, zoneid, 0, tsl, flags)) {
2161 			IRE_REFHOLD(ire);
2162 			rw_exit(&irb_ptr->irb_lock);
2163 			return (ire);
2164 		}
2165 	}
2166 	rw_exit(&irb_ptr->irb_lock);
2167 	return (NULL);
2168 }
2169 
2170 /*
2171  * Lookup cache. Don't return IRE_MARK_HIDDEN entries. Callers
2172  * should use ire_ctable_lookup with MATCH_IRE_MARK_HIDDEN to get
2173  * to the hidden ones.
2174  *
2175  * In general the zoneid has to match (where ALL_ZONES match all of them).
2176  * But for IRE_LOCAL we also need to handle the case where L2 should
2177  * conceptually loop back the packet. This is necessary since neither
2178  * Ethernet drivers nor Ethernet hardware loops back packets sent to their
2179  * own MAC address. This loopback is needed when the normal
2180  * routes (ignoring IREs with different zoneids) would send out the packet on
2181  * the same ill (or ill group) as the ill with which this IRE_LOCAL is
2182  * associated.
2183  *
2184  * Earlier versions of this code always matched an IRE_LOCAL independently of
2185  * the zoneid. We preserve that earlier behavior when
2186  * ip_restrict_interzone_loopback is turned off.
2187  */
2188 ire_t *
2189 ire_cache_lookup_v6(const in6_addr_t *addr, zoneid_t zoneid,
2190     const ts_label_t *tsl, ip_stack_t *ipst)
2191 {
2192 	irb_t *irb_ptr;
2193 	ire_t *ire;
2194 
2195 	irb_ptr = &ipst->ips_ip_cache_table_v6[IRE_ADDR_HASH_V6(*addr,
2196 	    ipst->ips_ip6_cache_table_size)];
2197 	rw_enter(&irb_ptr->irb_lock, RW_READER);
2198 	for (ire = irb_ptr->irb_ire; ire; ire = ire->ire_next) {
2199 		if (ire->ire_marks & (IRE_MARK_CONDEMNED|IRE_MARK_HIDDEN))
2200 			continue;
2201 		if (IN6_ARE_ADDR_EQUAL(&ire->ire_addr_v6, addr)) {
2202 			/*
2203 			 * Finally, check if the security policy has any
2204 			 * restriction on using this route for the specified
2205 			 * message.
2206 			 */
2207 			if (tsl != NULL &&
2208 			    ire->ire_gw_secattr != NULL &&
2209 			    tsol_ire_match_gwattr(ire, tsl) != 0) {
2210 				continue;
2211 			}
2212 
2213 			if (zoneid == ALL_ZONES || ire->ire_zoneid == zoneid ||
2214 			    ire->ire_zoneid == ALL_ZONES) {
2215 				IRE_REFHOLD(ire);
2216 				rw_exit(&irb_ptr->irb_lock);
2217 				return (ire);
2218 			}
2219 
2220 			if (ire->ire_type == IRE_LOCAL) {
2221 				if (ipst->ips_ip_restrict_interzone_loopback &&
2222 				    !ire_local_ok_across_zones(ire, zoneid,
2223 				    (void *)addr, tsl, ipst))
2224 					continue;
2225 
2226 				IRE_REFHOLD(ire);
2227 				rw_exit(&irb_ptr->irb_lock);
2228 				return (ire);
2229 			}
2230 		}
2231 	}
2232 	rw_exit(&irb_ptr->irb_lock);
2233 	return (NULL);
2234 }
2235 
2236 /*
2237  * Locate the interface ire that is tied to the cache ire 'cire' via
2238  * cire->ire_ihandle.
2239  *
2240  * We are trying to create the cache ire for an onlink destn. or
2241  * gateway in 'cire'. We are called from ire_add_v6() in the IRE_IF_RESOLVER
2242  * case for xresolv interfaces, after the ire has come back from
2243  * an external resolver.
2244  */
2245 static ire_t *
2246 ire_ihandle_lookup_onlink_v6(ire_t *cire)
2247 {
2248 	ire_t	*ire;
2249 	int	match_flags;
2250 	int	i;
2251 	int	j;
2252 	irb_t	*irb_ptr;
2253 	ip_stack_t	*ipst = cire->ire_ipst;
2254 
2255 	ASSERT(cire != NULL);
2256 
2257 	match_flags =  MATCH_IRE_TYPE | MATCH_IRE_IHANDLE | MATCH_IRE_MASK;
2258 	/*
2259 	 * We know that the mask of the interface ire equals cire->ire_cmask.
2260 	 * (When ip_newroute_v6() created 'cire' for an on-link destn.
2261 	 * it set its cmask from the interface ire's mask)
2262 	 */
2263 	ire = ire_ftable_lookup_v6(&cire->ire_addr_v6, &cire->ire_cmask_v6,
2264 	    NULL, IRE_INTERFACE, NULL, NULL, ALL_ZONES, cire->ire_ihandle,
2265 	    NULL, match_flags, ipst);
2266 	if (ire != NULL)
2267 		return (ire);
2268 	/*
2269 	 * If we didn't find an interface ire above, we can't declare failure.
2270 	 * For backwards compatibility, we need to support prefix routes
2271 	 * pointing to next hop gateways that are not on-link.
2272 	 *
2273 	 * In the resolver/noresolver case, ip_newroute_v6() thinks
2274 	 * it is creating the cache ire for an onlink destination in 'cire'.
2275 	 * But 'cire' is not actually onlink, because ire_ftable_lookup_v6()
2276 	 * cheated it, by doing ire_route_lookup_v6() twice and returning an
2277 	 * interface ire.
2278 	 *
2279 	 * Eg. default	-	gw1			(line 1)
2280 	 *	gw1	-	gw2			(line 2)
2281 	 *	gw2	-	hme0			(line 3)
2282 	 *
2283 	 * In the above example, ip_newroute_v6() tried to create the cache ire
2284 	 * 'cire' for gw1, based on the interface route in line 3. The
2285 	 * ire_ftable_lookup_v6() above fails, because there is
2286 	 * no interface route to reach gw1. (it is gw2). We fall thru below.
2287 	 *
2288 	 * Do a brute force search based on the ihandle in a subset of the
2289 	 * forwarding tables, corresponding to cire->ire_cmask_v6. Otherwise
2290 	 * things become very complex, since we don't have 'pire' in this
2291 	 * case. (Also note that this method is not possible in the offlink
2292 	 * case because we don't know the mask)
2293 	 */
2294 	i = ip_mask_to_plen_v6(&cire->ire_cmask_v6);
2295 	if ((ipst->ips_ip_forwarding_table_v6[i]) == NULL)
2296 		return (NULL);
2297 	for (j = 0; j < ipst->ips_ip6_ftable_hash_size; j++) {
2298 		irb_ptr = &ipst->ips_ip_forwarding_table_v6[i][j];
2299 		rw_enter(&irb_ptr->irb_lock, RW_READER);
2300 		for (ire = irb_ptr->irb_ire; ire != NULL;
2301 		    ire = ire->ire_next) {
2302 			if (ire->ire_marks & IRE_MARK_CONDEMNED)
2303 				continue;
2304 			if ((ire->ire_type & IRE_INTERFACE) &&
2305 			    (ire->ire_ihandle == cire->ire_ihandle)) {
2306 				IRE_REFHOLD(ire);
2307 				rw_exit(&irb_ptr->irb_lock);
2308 				return (ire);
2309 			}
2310 		}
2311 		rw_exit(&irb_ptr->irb_lock);
2312 	}
2313 	return (NULL);
2314 }
2315 
2316 
2317 /*
2318  * Locate the interface ire that is tied to the cache ire 'cire' via
2319  * cire->ire_ihandle.
2320  *
2321  * We are trying to create the cache ire for an offlink destn based
2322  * on the cache ire of the gateway in 'cire'. 'pire' is the prefix ire
2323  * as found by ip_newroute_v6(). We are called from ip_newroute_v6() in
2324  * the IRE_CACHE case.
2325  */
2326 ire_t *
2327 ire_ihandle_lookup_offlink_v6(ire_t *cire, ire_t *pire)
2328 {
2329 	ire_t	*ire;
2330 	int	match_flags;
2331 	in6_addr_t	gw_addr;
2332 	ipif_t		*gw_ipif;
2333 	ip_stack_t	*ipst = cire->ire_ipst;
2334 
2335 	ASSERT(cire != NULL && pire != NULL);
2336 
2337 	match_flags =  MATCH_IRE_TYPE | MATCH_IRE_IHANDLE | MATCH_IRE_MASK;
2338 	/*
2339 	 * ip_newroute_v6 calls ire_ftable_lookup with MATCH_IRE_ILL only
2340 	 * for on-link hosts. We should never be here for onlink.
2341 	 * Thus, use MATCH_IRE_ILL_GROUP.
2342 	 */
2343 	if (pire->ire_ipif != NULL)
2344 		match_flags |= MATCH_IRE_ILL_GROUP;
2345 	/*
2346 	 * We know that the mask of the interface ire equals cire->ire_cmask.
2347 	 * (When ip_newroute_v6() created 'cire' for an on-link destn. it set
2348 	 * its cmask from the interface ire's mask)
2349 	 */
2350 	ire = ire_ftable_lookup_v6(&cire->ire_addr_v6, &cire->ire_cmask_v6, 0,
2351 	    IRE_INTERFACE, pire->ire_ipif, NULL, ALL_ZONES, cire->ire_ihandle,
2352 	    NULL, match_flags, ipst);
2353 	if (ire != NULL)
2354 		return (ire);
2355 	/*
2356 	 * If we didn't find an interface ire above, we can't declare failure.
2357 	 * For backwards compatibility, we need to support prefix routes
2358 	 * pointing to next hop gateways that are not on-link.
2359 	 *
2360 	 * Assume we are trying to ping some offlink destn, and we have the
2361 	 * routing table below.
2362 	 *
2363 	 * Eg.	default	- gw1		<--- pire	(line 1)
2364 	 *	gw1	- gw2				(line 2)
2365 	 *	gw2	- hme0				(line 3)
2366 	 *
2367 	 * If we already have a cache ire for gw1 in 'cire', the
2368 	 * ire_ftable_lookup_v6 above would have failed, since there is no
2369 	 * interface ire to reach gw1. We will fallthru below.
2370 	 *
2371 	 * Here we duplicate the steps that ire_ftable_lookup_v6() did in
2372 	 * getting 'cire' from 'pire', in the MATCH_IRE_RECURSIVE case.
2373 	 * The differences are the following
2374 	 * i.   We want the interface ire only, so we call
2375 	 *	ire_ftable_lookup_v6() instead of ire_route_lookup_v6()
2376 	 * ii.  We look for only prefix routes in the 1st call below.
2377 	 * ii.  We want to match on the ihandle in the 2nd call below.
2378 	 */
2379 	match_flags =  MATCH_IRE_TYPE;
2380 	if (pire->ire_ipif != NULL)
2381 		match_flags |= MATCH_IRE_ILL_GROUP;
2382 
2383 	mutex_enter(&pire->ire_lock);
2384 	gw_addr = pire->ire_gateway_addr_v6;
2385 	mutex_exit(&pire->ire_lock);
2386 	ire = ire_ftable_lookup_v6(&gw_addr, 0, 0, IRE_OFFSUBNET,
2387 	    pire->ire_ipif, NULL, ALL_ZONES, 0, NULL, match_flags, ipst);
2388 	if (ire == NULL)
2389 		return (NULL);
2390 	/*
2391 	 * At this point 'ire' corresponds to the entry shown in line 2.
2392 	 * gw_addr is 'gw2' in the example above.
2393 	 */
2394 	mutex_enter(&ire->ire_lock);
2395 	gw_addr = ire->ire_gateway_addr_v6;
2396 	mutex_exit(&ire->ire_lock);
2397 	gw_ipif = ire->ire_ipif;
2398 	ire_refrele(ire);
2399 
2400 	match_flags |= MATCH_IRE_IHANDLE;
2401 	ire = ire_ftable_lookup_v6(&gw_addr, 0, 0, IRE_INTERFACE,
2402 	    gw_ipif, NULL, ALL_ZONES, cire->ire_ihandle,
2403 	    NULL, match_flags, ipst);
2404 	return (ire);
2405 }
2406 
2407 /*
2408  * Return the IRE_LOOPBACK, IRE_IF_RESOLVER or IRE_IF_NORESOLVER
2409  * ire associated with the specified ipif.
2410  *
2411  * This might occasionally be called when IPIF_UP is not set since
2412  * the IPV6_MULTICAST_IF as well as creating interface routes
2413  * allows specifying a down ipif (ipif_lookup* match ipifs that are down).
2414  *
2415  * Note that if IPIF_NOLOCAL, IPIF_NOXMIT, or IPIF_DEPRECATED is set on
2416  * the ipif this routine might return NULL.
2417  * (Sometimes called as writer though not required by this function.)
2418  */
2419 ire_t *
2420 ipif_to_ire_v6(const ipif_t *ipif)
2421 {
2422 	ire_t	*ire;
2423 	ip_stack_t	*ipst = ipif->ipif_ill->ill_ipst;
2424 
2425 	ASSERT(ipif->ipif_isv6);
2426 	if (ipif->ipif_ire_type == IRE_LOOPBACK) {
2427 		ire = ire_ctable_lookup_v6(&ipif->ipif_v6lcl_addr, NULL,
2428 		    IRE_LOOPBACK, ipif, ALL_ZONES, NULL,
2429 		    (MATCH_IRE_TYPE | MATCH_IRE_IPIF), ipst);
2430 	} else if (ipif->ipif_flags & IPIF_POINTOPOINT) {
2431 		/* In this case we need to lookup destination address. */
2432 		ire = ire_ftable_lookup_v6(&ipif->ipif_v6pp_dst_addr,
2433 		    &ipv6_all_ones, NULL, IRE_INTERFACE, ipif, NULL, ALL_ZONES,
2434 		    0, NULL, (MATCH_IRE_TYPE | MATCH_IRE_IPIF |
2435 		    MATCH_IRE_MASK), ipst);
2436 	} else {
2437 		ire = ire_ftable_lookup_v6(&ipif->ipif_v6subnet,
2438 		    &ipif->ipif_v6net_mask, NULL, IRE_INTERFACE, ipif, NULL,
2439 		    ALL_ZONES, 0, NULL, (MATCH_IRE_TYPE | MATCH_IRE_IPIF |
2440 		    MATCH_IRE_MASK), ipst);
2441 	}
2442 	return (ire);
2443 }
2444 
2445 /*
2446  * Return B_TRUE if a multirt route is resolvable
2447  * (or if no route is resolved yet), B_FALSE otherwise.
2448  * This only works in the global zone.
2449  */
2450 boolean_t
2451 ire_multirt_need_resolve_v6(const in6_addr_t *v6dstp, const ts_label_t *tsl,
2452     ip_stack_t *ipst)
2453 {
2454 	ire_t	*first_fire;
2455 	ire_t	*first_cire;
2456 	ire_t	*fire;
2457 	ire_t	*cire;
2458 	irb_t	*firb;
2459 	irb_t	*cirb;
2460 	int	unres_cnt = 0;
2461 	boolean_t resolvable = B_FALSE;
2462 
2463 	/* Retrieve the first IRE_HOST that matches the destination */
2464 	first_fire = ire_ftable_lookup_v6(v6dstp, &ipv6_all_ones, 0, IRE_HOST,
2465 	    NULL, NULL, ALL_ZONES, 0, tsl, MATCH_IRE_MASK | MATCH_IRE_TYPE |
2466 	    MATCH_IRE_SECATTR, ipst);
2467 
2468 	/* No route at all */
2469 	if (first_fire == NULL) {
2470 		return (B_TRUE);
2471 	}
2472 
2473 	firb = first_fire->ire_bucket;
2474 	ASSERT(firb);
2475 
2476 	/* Retrieve the first IRE_CACHE ire for that destination. */
2477 	first_cire = ire_cache_lookup_v6(v6dstp, GLOBAL_ZONEID, tsl, ipst);
2478 
2479 	/* No resolved route. */
2480 	if (first_cire == NULL) {
2481 		ire_refrele(first_fire);
2482 		return (B_TRUE);
2483 	}
2484 
2485 	/* At least one route is resolved. */
2486 
2487 	cirb = first_cire->ire_bucket;
2488 	ASSERT(cirb);
2489 
2490 	/* Count the number of routes to that dest that are declared. */
2491 	IRB_REFHOLD(firb);
2492 	for (fire = first_fire; fire != NULL; fire = fire->ire_next) {
2493 		if (!(fire->ire_flags & RTF_MULTIRT))
2494 			continue;
2495 		if (!IN6_ARE_ADDR_EQUAL(&fire->ire_addr_v6, v6dstp))
2496 			continue;
2497 		unres_cnt++;
2498 	}
2499 	IRB_REFRELE(firb);
2500 
2501 
2502 	/* Then subtract the number of routes to that dst that are resolved */
2503 	IRB_REFHOLD(cirb);
2504 	for (cire = first_cire; cire != NULL; cire = cire->ire_next) {
2505 	    if (!(cire->ire_flags & RTF_MULTIRT))
2506 		continue;
2507 	    if (!IN6_ARE_ADDR_EQUAL(&cire->ire_addr_v6, v6dstp))
2508 		continue;
2509 	    if (cire->ire_marks & (IRE_MARK_CONDEMNED|IRE_MARK_HIDDEN))
2510 		continue;
2511 	    unres_cnt--;
2512 	}
2513 	IRB_REFRELE(cirb);
2514 
2515 	/* At least one route is unresolved; search for a resolvable route. */
2516 	if (unres_cnt > 0)
2517 		resolvable = ire_multirt_lookup_v6(&first_cire, &first_fire,
2518 		    MULTIRT_USESTAMP|MULTIRT_CACHEGW, tsl, ipst);
2519 
2520 	if (first_fire)
2521 		ire_refrele(first_fire);
2522 
2523 	if (first_cire)
2524 		ire_refrele(first_cire);
2525 
2526 	return (resolvable);
2527 }
2528 
2529 
2530 /*
2531  * Return B_TRUE and update *ire_arg and *fire_arg
2532  * if at least one resolvable route is found.
2533  * Return B_FALSE otherwise (all routes are resolved or
2534  * the remaining unresolved routes are all unresolvable).
2535  * This only works in the global zone.
2536  */
2537 boolean_t
2538 ire_multirt_lookup_v6(ire_t **ire_arg, ire_t **fire_arg, uint32_t flags,
2539     const ts_label_t *tsl, ip_stack_t *ipst)
2540 {
2541 	clock_t	delta;
2542 	ire_t	*best_fire = NULL;
2543 	ire_t	*best_cire = NULL;
2544 	ire_t	*first_fire;
2545 	ire_t	*first_cire;
2546 	ire_t	*fire;
2547 	ire_t	*cire;
2548 	irb_t	*firb = NULL;
2549 	irb_t	*cirb = NULL;
2550 	ire_t	*gw_ire;
2551 	boolean_t	already_resolved;
2552 	boolean_t	res;
2553 	in6_addr_t	v6dst;
2554 	in6_addr_t	v6gw;
2555 
2556 	ip2dbg(("ire_multirt_lookup_v6: *ire_arg %p, *fire_arg %p, "
2557 	    "flags %04x\n", (void *)*ire_arg, (void *)*fire_arg, flags));
2558 
2559 	ASSERT(ire_arg);
2560 	ASSERT(fire_arg);
2561 
2562 	/* Not an IRE_HOST ire; give up. */
2563 	if ((*fire_arg == NULL) ||
2564 	    ((*fire_arg)->ire_type != IRE_HOST)) {
2565 		return (B_FALSE);
2566 	}
2567 
2568 	/* This is the first IRE_HOST ire for that destination. */
2569 	first_fire = *fire_arg;
2570 	firb = first_fire->ire_bucket;
2571 	ASSERT(firb);
2572 
2573 	mutex_enter(&first_fire->ire_lock);
2574 	v6dst = first_fire->ire_addr_v6;
2575 	mutex_exit(&first_fire->ire_lock);
2576 
2577 	ip2dbg(("ire_multirt_lookup_v6: dst %08x\n",
2578 	    ntohl(V4_PART_OF_V6(v6dst))));
2579 
2580 	/*
2581 	 * Retrieve the first IRE_CACHE ire for that destination;
2582 	 * if we don't find one, no route for that dest is
2583 	 * resolved yet.
2584 	 */
2585 	first_cire = ire_cache_lookup_v6(&v6dst, GLOBAL_ZONEID, tsl, ipst);
2586 	if (first_cire) {
2587 		cirb = first_cire->ire_bucket;
2588 	}
2589 
2590 	ip2dbg(("ire_multirt_lookup_v6: first_cire %p\n", (void *)first_cire));
2591 
2592 	/*
2593 	 * Search for a resolvable route, giving the top priority
2594 	 * to routes that can be resolved without any call to the resolver.
2595 	 */
2596 	IRB_REFHOLD(firb);
2597 
2598 	if (!IN6_IS_ADDR_MULTICAST(&v6dst)) {
2599 		/*
2600 		 * For all multiroute IRE_HOST ires for that destination,
2601 		 * check if the route via the IRE_HOST's gateway is
2602 		 * resolved yet.
2603 		 */
2604 		for (fire = first_fire; fire != NULL; fire = fire->ire_next) {
2605 
2606 			if (!(fire->ire_flags & RTF_MULTIRT))
2607 				continue;
2608 			if (!IN6_ARE_ADDR_EQUAL(&fire->ire_addr_v6, &v6dst))
2609 				continue;
2610 
2611 			if (fire->ire_gw_secattr != NULL &&
2612 			    tsol_ire_match_gwattr(fire, tsl) != 0) {
2613 				continue;
2614 			}
2615 
2616 			mutex_enter(&fire->ire_lock);
2617 			v6gw = fire->ire_gateway_addr_v6;
2618 			mutex_exit(&fire->ire_lock);
2619 
2620 			ip2dbg(("ire_multirt_lookup_v6: fire %p, "
2621 			    "ire_addr %08x, ire_gateway_addr %08x\n",
2622 			    (void *)fire,
2623 			    ntohl(V4_PART_OF_V6(fire->ire_addr_v6)),
2624 			    ntohl(V4_PART_OF_V6(v6gw))));
2625 
2626 			already_resolved = B_FALSE;
2627 
2628 			if (first_cire) {
2629 				ASSERT(cirb);
2630 
2631 				IRB_REFHOLD(cirb);
2632 				/*
2633 				 * For all IRE_CACHE ires for that
2634 				 * destination.
2635 				 */
2636 				for (cire = first_cire;
2637 				    cire != NULL;
2638 				    cire = cire->ire_next) {
2639 
2640 					if (!(cire->ire_flags & RTF_MULTIRT))
2641 						continue;
2642 					if (!IN6_ARE_ADDR_EQUAL(
2643 					    &cire->ire_addr_v6, &v6dst))
2644 						continue;
2645 					if (cire->ire_marks &
2646 					    (IRE_MARK_CONDEMNED|
2647 						IRE_MARK_HIDDEN))
2648 						continue;
2649 
2650 					if (cire->ire_gw_secattr != NULL &&
2651 					    tsol_ire_match_gwattr(cire,
2652 					    tsl) != 0) {
2653 						continue;
2654 					}
2655 
2656 					/*
2657 					 * Check if the IRE_CACHE's gateway
2658 					 * matches the IRE_HOST's gateway.
2659 					 */
2660 					if (IN6_ARE_ADDR_EQUAL(
2661 					    &cire->ire_gateway_addr_v6,
2662 					    &v6gw)) {
2663 						already_resolved = B_TRUE;
2664 						break;
2665 					}
2666 				}
2667 				IRB_REFRELE(cirb);
2668 			}
2669 
2670 			/*
2671 			 * This route is already resolved;
2672 			 * proceed with next one.
2673 			 */
2674 			if (already_resolved) {
2675 				ip2dbg(("ire_multirt_lookup_v6: found cire %p, "
2676 				    "already resolved\n", (void *)cire));
2677 				continue;
2678 			}
2679 
2680 			/*
2681 			 * The route is unresolved; is it actually
2682 			 * resolvable, i.e. is there a cache or a resolver
2683 			 * for the gateway?
2684 			 */
2685 			gw_ire = ire_route_lookup_v6(&v6gw, 0, 0, 0, NULL, NULL,
2686 			    ALL_ZONES, tsl, MATCH_IRE_RECURSIVE |
2687 			    MATCH_IRE_SECATTR, ipst);
2688 
2689 			ip2dbg(("ire_multirt_lookup_v6: looked up gw_ire %p\n",
2690 			    (void *)gw_ire));
2691 
2692 			/*
2693 			 * This route can be resolved without any call to the
2694 			 * resolver; if the MULTIRT_CACHEGW flag is set,
2695 			 * give the top priority to this ire and exit the
2696 			 * loop.
2697 			 * This occurs when an resolver reply is processed
2698 			 * through ip_wput_nondata()
2699 			 */
2700 			if ((flags & MULTIRT_CACHEGW) &&
2701 			    (gw_ire != NULL) &&
2702 			    (gw_ire->ire_type & IRE_CACHETABLE)) {
2703 				/*
2704 				 * Release the resolver associated to the
2705 				 * previous candidate best ire, if any.
2706 				 */
2707 				if (best_cire) {
2708 					ire_refrele(best_cire);
2709 					ASSERT(best_fire);
2710 				}
2711 
2712 				best_fire = fire;
2713 				best_cire = gw_ire;
2714 
2715 				ip2dbg(("ire_multirt_lookup_v6: found top prio "
2716 				    "best_fire %p, best_cire %p\n",
2717 				    (void *)best_fire, (void *)best_cire));
2718 				break;
2719 			}
2720 
2721 			/*
2722 			 * Compute the time elapsed since our preceding
2723 			 * attempt to  resolve that route.
2724 			 * If the MULTIRT_USESTAMP flag is set, we take that
2725 			 * route into account only if this time interval
2726 			 * exceeds ip_multirt_resolution_interval;
2727 			 * this prevents us from attempting to resolve a
2728 			 * broken route upon each sending of a packet.
2729 			 */
2730 			delta = lbolt - fire->ire_last_used_time;
2731 			delta = TICK_TO_MSEC(delta);
2732 
2733 			res = (boolean_t)
2734 			    ((delta > ipst->
2735 				ips_ip_multirt_resolution_interval) ||
2736 			    (!(flags & MULTIRT_USESTAMP)));
2737 
2738 			ip2dbg(("ire_multirt_lookup_v6: fire %p, delta %lu, "
2739 			    "res %d\n",
2740 			    (void *)fire, delta, res));
2741 
2742 			if (res) {
2743 				/*
2744 				 * A resolver exists for the gateway: save
2745 				 * the current IRE_HOST ire as a candidate
2746 				 * best ire. If we later discover that a
2747 				 * top priority ire exists (i.e. no need to
2748 				 * call the resolver), then this new ire
2749 				 * will be preferred to the current one.
2750 				 */
2751 				if (gw_ire != NULL) {
2752 					if (best_fire == NULL) {
2753 						ASSERT(best_cire == NULL);
2754 
2755 						best_fire = fire;
2756 						best_cire = gw_ire;
2757 
2758 						ip2dbg(("ire_multirt_lookup_v6:"
2759 						    "found candidate "
2760 						    "best_fire %p, "
2761 						    "best_cire %p\n",
2762 						    (void *)best_fire,
2763 						    (void *)best_cire));
2764 
2765 						/*
2766 						 * If MULTIRT_CACHEGW is not
2767 						 * set, we ignore the top
2768 						 * priority ires that can
2769 						 * be resolved without any
2770 						 * call to the resolver;
2771 						 * In that case, there is
2772 						 * actually no need
2773 						 * to continue the loop.
2774 						 */
2775 						if (!(flags &
2776 						    MULTIRT_CACHEGW)) {
2777 							break;
2778 						}
2779 						continue;
2780 					}
2781 				} else {
2782 					/*
2783 					 * No resolver for the gateway: the
2784 					 * route is not resolvable.
2785 					 * If the MULTIRT_SETSTAMP flag is
2786 					 * set, we stamp the IRE_HOST ire,
2787 					 * so we will not select it again
2788 					 * during this resolution interval.
2789 					 */
2790 					if (flags & MULTIRT_SETSTAMP)
2791 						fire->ire_last_used_time =
2792 						    lbolt;
2793 				}
2794 			}
2795 
2796 			if (gw_ire != NULL)
2797 				ire_refrele(gw_ire);
2798 		}
2799 	} else { /* IN6_IS_ADDR_MULTICAST(&v6dst) */
2800 
2801 		for (fire = first_fire;
2802 		    fire != NULL;
2803 		    fire = fire->ire_next) {
2804 
2805 			if (!(fire->ire_flags & RTF_MULTIRT))
2806 				continue;
2807 			if (!IN6_ARE_ADDR_EQUAL(&fire->ire_addr_v6, &v6dst))
2808 				continue;
2809 
2810 			if (fire->ire_gw_secattr != NULL &&
2811 			    tsol_ire_match_gwattr(fire, tsl) != 0) {
2812 				continue;
2813 			}
2814 
2815 			already_resolved = B_FALSE;
2816 
2817 			mutex_enter(&fire->ire_lock);
2818 			v6gw = fire->ire_gateway_addr_v6;
2819 			mutex_exit(&fire->ire_lock);
2820 
2821 			gw_ire = ire_ftable_lookup_v6(&v6gw, 0, 0,
2822 			    IRE_INTERFACE, NULL, NULL, ALL_ZONES, 0, tsl,
2823 			    MATCH_IRE_RECURSIVE | MATCH_IRE_TYPE |
2824 			    MATCH_IRE_SECATTR, ipst);
2825 
2826 			/* No resolver for the gateway; we skip this ire. */
2827 			if (gw_ire == NULL) {
2828 				continue;
2829 			}
2830 
2831 			if (first_cire) {
2832 
2833 				IRB_REFHOLD(cirb);
2834 				/*
2835 				 * For all IRE_CACHE ires for that
2836 				 * destination.
2837 				 */
2838 				for (cire = first_cire;
2839 				    cire != NULL;
2840 				    cire = cire->ire_next) {
2841 
2842 					if (!(cire->ire_flags & RTF_MULTIRT))
2843 						continue;
2844 					if (!IN6_ARE_ADDR_EQUAL(
2845 					    &cire->ire_addr_v6, &v6dst))
2846 						continue;
2847 					if (cire->ire_marks &
2848 					    (IRE_MARK_CONDEMNED|
2849 						IRE_MARK_HIDDEN))
2850 						continue;
2851 
2852 					if (cire->ire_gw_secattr != NULL &&
2853 					    tsol_ire_match_gwattr(cire,
2854 					    tsl) != 0) {
2855 						continue;
2856 					}
2857 
2858 					/*
2859 					 * Cache entries are linked to the
2860 					 * parent routes using the parent handle
2861 					 * (ire_phandle). If no cache entry has
2862 					 * the same handle as fire, fire is
2863 					 * still unresolved.
2864 					 */
2865 					ASSERT(cire->ire_phandle != 0);
2866 					if (cire->ire_phandle ==
2867 					    fire->ire_phandle) {
2868 						already_resolved = B_TRUE;
2869 						break;
2870 					}
2871 				}
2872 				IRB_REFRELE(cirb);
2873 			}
2874 
2875 			/*
2876 			 * This route is already resolved; proceed with
2877 			 * next one.
2878 			 */
2879 			if (already_resolved) {
2880 				ire_refrele(gw_ire);
2881 				continue;
2882 			}
2883 
2884 			/*
2885 			 * Compute the time elapsed since our preceding
2886 			 * attempt to resolve that route.
2887 			 * If the MULTIRT_USESTAMP flag is set, we take
2888 			 * that route into account only if this time
2889 			 * interval exceeds ip_multirt_resolution_interval;
2890 			 * this prevents us from attempting to resolve a
2891 			 * broken route upon each sending of a packet.
2892 			 */
2893 			delta = lbolt - fire->ire_last_used_time;
2894 			delta = TICK_TO_MSEC(delta);
2895 
2896 			res = (boolean_t)
2897 			    ((delta > ipst->
2898 				ips_ip_multirt_resolution_interval) ||
2899 			    (!(flags & MULTIRT_USESTAMP)));
2900 
2901 			ip3dbg(("ire_multirt_lookup_v6: fire %p, delta %lx, "
2902 			    "flags %04x, res %d\n",
2903 			    (void *)fire, delta, flags, res));
2904 
2905 			if (res) {
2906 				if (best_cire) {
2907 					/*
2908 					 * Release the resolver associated
2909 					 * to the preceding candidate best
2910 					 * ire, if any.
2911 					 */
2912 					ire_refrele(best_cire);
2913 					ASSERT(best_fire);
2914 				}
2915 				best_fire = fire;
2916 				best_cire = gw_ire;
2917 				continue;
2918 			}
2919 
2920 			ire_refrele(gw_ire);
2921 		}
2922 	}
2923 
2924 	if (best_fire) {
2925 		IRE_REFHOLD(best_fire);
2926 	}
2927 	IRB_REFRELE(firb);
2928 
2929 	/* Release the first IRE_CACHE we initially looked up, if any. */
2930 	if (first_cire)
2931 		ire_refrele(first_cire);
2932 
2933 	/* Found a resolvable route. */
2934 	if (best_fire) {
2935 		ASSERT(best_cire);
2936 
2937 		if (*fire_arg)
2938 			ire_refrele(*fire_arg);
2939 		if (*ire_arg)
2940 			ire_refrele(*ire_arg);
2941 
2942 		/*
2943 		 * Update the passed arguments with the
2944 		 * resolvable multirt route we found
2945 		 */
2946 		*fire_arg = best_fire;
2947 		*ire_arg = best_cire;
2948 
2949 		ip2dbg(("ire_multirt_lookup_v6: returning B_TRUE, "
2950 		    "*fire_arg %p, *ire_arg %p\n",
2951 		    (void *)best_fire, (void *)best_cire));
2952 
2953 		return (B_TRUE);
2954 	}
2955 
2956 	ASSERT(best_cire == NULL);
2957 
2958 	ip2dbg(("ire_multirt_lookup_v6: returning B_FALSE, *fire_arg %p, "
2959 	    "*ire_arg %p\n",
2960 	    (void *)*fire_arg, (void *)*ire_arg));
2961 
2962 	/* No resolvable route. */
2963 	return (B_FALSE);
2964 }
2965 
2966 
2967 /*
2968  * Find an IRE_OFFSUBNET IRE entry for the multicast address 'v6dstp'
2969  * that goes through 'ipif'. As a fallback, a route that goes through
2970  * ipif->ipif_ill can be returned.
2971  */
2972 ire_t *
2973 ipif_lookup_multi_ire_v6(ipif_t *ipif, const in6_addr_t *v6dstp)
2974 {
2975 	ire_t	*ire;
2976 	ire_t	*save_ire = NULL;
2977 	ire_t   *gw_ire;
2978 	irb_t   *irb;
2979 	in6_addr_t v6gw;
2980 	int	match_flags = MATCH_IRE_TYPE | MATCH_IRE_ILL;
2981 	ip_stack_t	*ipst = ipif->ipif_ill->ill_ipst;
2982 
2983 	ire = ire_ftable_lookup_v6(v6dstp, 0, 0, 0, NULL, NULL, ALL_ZONES, 0,
2984 	    NULL, MATCH_IRE_DEFAULT, ipst);
2985 
2986 	if (ire == NULL)
2987 		return (NULL);
2988 
2989 	irb = ire->ire_bucket;
2990 	ASSERT(irb);
2991 
2992 	IRB_REFHOLD(irb);
2993 	ire_refrele(ire);
2994 	for (ire = irb->irb_ire; ire != NULL; ire = ire->ire_next) {
2995 		if (!IN6_ARE_ADDR_EQUAL(&ire->ire_addr_v6, v6dstp) ||
2996 		    (ipif->ipif_zoneid != ire->ire_zoneid &&
2997 		    ire->ire_zoneid != ALL_ZONES)) {
2998 			continue;
2999 		}
3000 
3001 		switch (ire->ire_type) {
3002 		case IRE_DEFAULT:
3003 		case IRE_PREFIX:
3004 		case IRE_HOST:
3005 			mutex_enter(&ire->ire_lock);
3006 			v6gw = ire->ire_gateway_addr_v6;
3007 			mutex_exit(&ire->ire_lock);
3008 			gw_ire = ire_ftable_lookup_v6(&v6gw, 0, 0,
3009 			    IRE_INTERFACE, ipif, NULL, ALL_ZONES, 0,
3010 			    NULL, match_flags, ipst);
3011 
3012 			if (gw_ire != NULL) {
3013 				if (save_ire != NULL) {
3014 					ire_refrele(save_ire);
3015 				}
3016 				IRE_REFHOLD(ire);
3017 				if (gw_ire->ire_ipif == ipif) {
3018 					ire_refrele(gw_ire);
3019 
3020 					IRB_REFRELE(irb);
3021 					return (ire);
3022 				}
3023 				ire_refrele(gw_ire);
3024 				save_ire = ire;
3025 			}
3026 			break;
3027 		case IRE_IF_NORESOLVER:
3028 		case IRE_IF_RESOLVER:
3029 			if (ire->ire_ipif == ipif) {
3030 				if (save_ire != NULL) {
3031 					ire_refrele(save_ire);
3032 				}
3033 				IRE_REFHOLD(ire);
3034 
3035 				IRB_REFRELE(irb);
3036 				return (ire);
3037 			}
3038 			break;
3039 		}
3040 	}
3041 	IRB_REFRELE(irb);
3042 
3043 	return (save_ire);
3044 }
3045